In [1]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.optim import Adam
from torch.utils.data import Dataset,DataLoader
import tqdm
from pathlib import Path
import os

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: F:\projects\python_notebook\myenv\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary F:\projects\python_notebook\myenv\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


In [2]:
# tokenizer path
save_path = 'tokenized_data'

tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)# creating the model
model = GPT2LMHeadModel(config)

In [3]:
# text files (articles)
paths = [str(x) for x in Path("./fa_corpus/").glob("**/*.txt")]

In [4]:
len(paths)

714274

In [5]:
paths = paths[:274] #for test

# Creating Data set

In [6]:
single_string =''
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
   x = f.read()
  single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)


In [7]:
class CustomDataset(Dataset):
    def __init__(self, inputs, labels):
        super().__init__()
        self.inputs = inputs
        self.labels = labels
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        input_data = self.inputs[idx]
        label = self.labels[idx]
        return torch.tensor(input_data), torch.tensor(label)
        # return input_data, label

examples = []
block_size = 100
BATCH_SIZE = 16
LEARNING_RATE = 1e-3
EPOCHS = 10

for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])

inputs, labels = [], []

for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])

dataset = CustomDataset(inputs, labels)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Train model

In [8]:
# model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
model.train()
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [9]:
def train(dataloader, model, optimizer):

    for i in tqdm.tqdm(range(EPOCHS)):
        for X, Y in dataloader:
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            loss = model(X, labels=X).loss
            loss.backward()
            optimizer.step()
        ## save model to directory
        output_dir = './model/'# creating directory if it is not present
        if not os.path.exists(output_dir):
          os.mkdir(output_dir)
        torch.save(model.state_dict(), f"{output_dir}model_fa.pt")


In [10]:
train(dataloader, model, optimizer)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:42<00:00, 22.24s/it]


# Testing inference

In [11]:
def infer(inp):
    # todo model in infernence mod
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(
        X, 
        attention_mask=a,
        max_length = 50,
        num_beams = 5,
        temperature = 0.7,
        no_repeat_ngram_size=2,
        num_return_sequences=5)
    output = tokenizer.decode(output[0])
    return output


# from transformers import WEIGHTS_NAME, CONFIG_NAME
# output_dir = './model_bn_custom/'# creating directory if it is not present
# if not os.path.exists(output_dir):
#   os.mkdir(output_dir)
# model_to_save = model.module if hasattr(model, 'module') else modeloutput_model_file = os.path.join(output_dir, WEIGHTS_NAME)
# output_config_file = os.path.join(output_dir, CONFIG_NAME)# save model and model configs
# model.save_pretrained(output_dir)
# model_to_save.config.to_json_file(output_config_file)# save tokenizer
# tokenizer.save_pretrained(output_dir)


# tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
# model = TFGPT2LMHeadModel.from_pretrained(output_dir)

In [14]:
infer("جستجو های یافت شده در")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'جستجو های یافت شده در اثر این اثر در تاریخ با شماره ثبت به عنوان یکی از آثار ملی ایران به ثبت رسیده است جستارهای وابسته فهرست آثار دوره قاجاریان در شهرستان سازمان میراث فرهنگی صنایع دستی گردشگری منابع</s></s></s></s></s></s></s></s></s>'