#### Download data

https://www.kaggle.com/datasets/mrapplexz/bashim-quotes

In [1]:
model_name = "sberbank-ai/rugpt3small_based_on_gpt2"

In [3]:
!pip install transformers[torch]



In [9]:
import logging
import numpy as np
import pandas as pd

from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
from transformers.trainer import logger as noisy_logger

In [5]:
noisy_logger.setLevel(logging.WARNING)

In [6]:
df_rec = pd.read_json('/content/sample_data/dataset.jsonl', lines=True).set_index('id')

In [7]:
df_rec.shape

(81497, 3)

In [8]:
df_rec = df_rec.sample(10000)

In [9]:
import re

def clear_text(text):
    clr_text = re.sub(r"<.*?>", " ", text).lower()
    clr_text = summary = re.sub(r"\s", " ", clr_text)
    return clr_text

In [10]:
df_rec["clear_text"] = df_rec["text"].apply(lambda x: clear_text(x))
df_rec.head()

Unnamed: 0_level_0,date,rating,text,clear_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
432420,2015-02-17 05:13:00+00:00,8812.0,Sodolyana: —É—Ç—Ä–æ –Ω–∞—á–∞–ª–æ—Å—å —à–∏–∫–∞—Ä–Ω–æ ) —è –∂ –∂–∏–≤—É –Ω–∞...,sodolyana: —É—Ç—Ä–æ –Ω–∞—á–∞–ª–æ—Å—å —à–∏–∫–∞—Ä–Ω–æ ) —è –∂ –∂–∏–≤—É –Ω–∞...
432860,2015-03-17 04:44:00+00:00,2164.0,–ù–æ–≤–æ—Å—Ç—å –Ω–∞ –æ–ø–µ–Ω–Ω–µ—Ç–µ:\n\n–ü–æ—Å–ª–µ —Ä–µ—Ü–µ–Ω–∑–∏—Ä–æ–≤–∞–Ω–∏—è –º...,–Ω–æ–≤–æ—Å—Ç—å –Ω–∞ –æ–ø–µ–Ω–Ω–µ—Ç–µ: –ø–æ—Å–ª–µ —Ä–µ—Ü–µ–Ω–∑–∏—Ä–æ–≤–∞–Ω–∏—è –º–Ω–æ...
250609,2007-05-29 07:51:00+00:00,2587.0,Kimbol –±–ª—è—Ç—å!!!!! —Ç–æ–ª—å–∫–æ —á—Ç–æ!!!! —Ä–µ–∞–ª—å–Ω–æ!!!! —Å...,kimbol –±–ª—è—Ç—å!!!!! —Ç–æ–ª—å–∫–æ —á—Ç–æ!!!! —Ä–µ–∞–ª—å–Ω–æ!!!! —Å...
445607,2017-07-04 04:45:00+00:00,1526.0,"—Ö—Ö—Ö: –ö–∞–∫, —Ç—ã –Ω–µ —Å–º–æ—Ç—Ä–µ–ª ""–°–µ–º–µ–π–∫—É –ê–¥–∞–º—Å–æ–≤""? –î–∞ ...","—Ö—Ö—Ö: –∫–∞–∫, —Ç—ã –Ω–µ —Å–º–æ—Ç—Ä–µ–ª ""—Å–µ–º–µ–π–∫—É –∞–¥–∞–º—Å–æ–≤""? –¥–∞ ..."
6339,2005-12-26 06:33:00+00:00,6364.0,"pSycho:\n—Å–ª—É—à–∞–π, —Ç—ã –≤–æ—Ç —Ç–∞–º, –≤ –¥–Ω–µ–≤–Ω–∏–∫–µ –ø—Ä–æ –º–æ...","psycho: —Å–ª—É—à–∞–π, —Ç—ã –≤–æ—Ç —Ç–∞–º, –≤ –¥–Ω–µ–≤–Ω–∏–∫–µ –ø—Ä–æ –º–æ—Ä..."


In [11]:
data = df_rec.loc[:, 'clear_text']

In [12]:
data

id
432420    sodolyana: —É—Ç—Ä–æ –Ω–∞—á–∞–ª–æ—Å—å —à–∏–∫–∞—Ä–Ω–æ ) —è –∂ –∂–∏–≤—É –Ω–∞...
432860    –Ω–æ–≤–æ—Å—Ç—å –Ω–∞ –æ–ø–µ–Ω–Ω–µ—Ç–µ:  –ø–æ—Å–ª–µ —Ä–µ—Ü–µ–Ω–∑–∏—Ä–æ–≤–∞–Ω–∏—è –º–Ω–æ...
250609    kimbol –±–ª—è—Ç—å!!!!! —Ç–æ–ª—å–∫–æ —á—Ç–æ!!!! —Ä–µ–∞–ª—å–Ω–æ!!!! —Å...
445607    —Ö—Ö—Ö: –∫–∞–∫, —Ç—ã –Ω–µ —Å–º–æ—Ç—Ä–µ–ª "—Å–µ–º–µ–π–∫—É –∞–¥–∞–º—Å–æ–≤"? –¥–∞ ...
6339      psycho: —Å–ª—É—à–∞–π, —Ç—ã –≤–æ—Ç —Ç–∞–º, –≤ –¥–Ω–µ–≤–Ω–∏–∫–µ –ø—Ä–æ –º–æ—Ä...
                                ...                        
455086    xxx: –æ–±–æ—Ä–∑–µ–≤—à–∏–π –Ω–µ–∑–∞–º–µ–Ω–∏–º—ã–π —Å–æ—Ç—Ä—É–¥–Ω–∏–∫ —Å–æ–∑–¥–∞–≤–∞—Ç...
404027    xxx: –º–æ—è –∂–µ–Ω–∞ –æ—á–µ–Ω—å —Ä–∞–∑–æ—á–∞—Ä–æ–≤–∞–ª–∞—Å—å, —É–∑–Ω–∞–≤, —á—Ç–æ...
424960    xxx: –µ–¥—É –≤ —ç–ª–µ–∫—Ç—Ä–∏—á–∫–µ. –æ–±—ã—á–Ω–æ–µ –¥–µ–ª–æ, –Ω–∏—á–µ–≥–æ –Ω–µ...
420988    xxx: –¥–æ–∂–¥–∞–ª–∏—Å—å, –ø–æ—Å–æ–Ω—ã! xxx: "—á–∞—Å—Ç–Ω–∞—è –æ—Ä–≥–∞–Ω–∏–∑–∞...
440987    alexander: –≤—Å—ë, –ø–æ—Ä–µ—à–∞–ª –≤—Ä–æ–¥–µ) —Å–æ—Ä—Ä–∏ —á—Ç–æ –¥—ë—Ä–≥–∞...
Name: clear_text, Length

In [13]:
import re
from sklearn.model_selection import train_test_split

def build_text_files(data_json, dest_path):
    with open(dest_path, "w", encoding="utf-8") as f:
        data = ''
        for texts in data_json:
            summary = str(texts).strip()
            data += summary + "  "

        f.write(data)

In [14]:
train, test = train_test_split(data, test_size=0.15)

In [15]:
build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

In [16]:
print("Train dataset length: "+ str(len(train)))
print("Test dataset length: "+ str(len(test)))

Train dataset length: 8500
Test dataset length: 1500


In [17]:
train[:5]

id
432800    xxx: –∂–µ–Ω—Å–∫–æ–µ –æ –Ω–∞–±–æ–ª–µ–≤—à–µ–º: –æ—Ç –æ–±—É–≤–∏ –æ–∫—Ä–∞—Å–∏–ª–∏—Å—å...
428070    xxx: —Å–æ–±–∏—Ä–∞—é—Å—å —Å–µ–≥–æ–¥–Ω—è –Ω–∞ —Ä–∞–±–æ—Ç—É, –∑–≤–æ–Ω–∏—Ç –¥–æ–º–æ—Ñ...
420127    xxx: —Ç–æ–≥–¥–∞ –µ—â–µ html-—Ç–µ–≥–∏ –±—ã–ª–∏ –±–æ–ª—å—à–∏–º–∏, –∞ java...
450884    xxx: –ø–æ—é—Ç —Ç–∞–º –Ω–µ mascarpone, –∞ moscow calling....
428134      –≤–æ—Ç –Ω–∞–ø—Ä–∏–º–µ—Ä —Å–∫–∞–∂–∏—Ç–µ, –º–æ–∂–Ω–æ –ª–∏ –∑–∞–∞—Ä—Ö–∏–≤–∏—Ä–æ–≤–∞—Ç...
Name: clear_text, dtype: object

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



#### Training model

In [20]:
model = AutoModelForCausalLM.from_pretrained(model_name)

In [37]:
!pip install accelerate -U



In [21]:
training_args = TrainingArguments(

    "phrase",
    evaluation_strategy = "epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',

    )

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,4.0634,3.942977
2,3.9299,3.934325


TrainOutput(global_step=2192, training_loss=4.021550060188683, metrics={'train_runtime': 429.1637, 'train_samples_per_second': 20.421, 'train_steps_per_second': 5.108, 'total_flos': 572490842112000.0, 'train_loss': 4.021550060188683, 'epoch': 2.0})

#### Text generate

In [24]:
def generate_text(prefix):
    tokens = tokenizer(prefix, return_tensors='pt')
    size = tokens['input_ids'].shape[1]

    output = model.generate(
        **tokens,
        #end_token=end_token_id,
        do_sample=False,
        max_length=size+50,
        early_stopping=True,
        length_penalty=2.0,
        repetition_penalty=8.,
        temperature=0.5,
        num_beams=3,
        no_repeat_ngram_size=5
    )

    decoded = tokenizer.decode(output[0])
    result = decoded[len(prefix):]
    return prefix + result

In [34]:
import torch
if torch.cuda.is_available():  # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0)) # If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [30]:
device = torch.device("cpu")
model = model.to(device)

In [31]:
print(generate_text("—Ç—ã –µ—â–µ –¥–æ–ª–≥–æ?"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


—Ç—ã –µ—â–µ –¥–æ–ª–≥–æ?  xxx: —É –Ω–∞—Å –≤ —à–∫–æ–ª–µ –Ω–∞ –ø–µ—Ä–µ–º–µ–Ω–µ —É—á–∏—Ç–µ–ª—å–Ω–∏—Ü–∞ –ø–æ —Ñ–∏–∑–∫—É–ª—å—Ç—É—Ä–µ —Å–∫–∞–∑–∞–ª–∞, —á—Ç–æ –µ—Å–ª–∏ —Ç—ã –Ω–µ –±—É–¥–µ—à—å –¥–µ–ª–∞—Ç—å –∑–∞—Ä—è–¥–∫—É –∫–∞–∂–¥—ã–π –¥–µ–Ω—å, —Ç–æ —á–µ—Ä–µ–∑ –≥–æ–¥ —Ç–µ–±—è –±—É–¥—É—Ç —Å—á–∏—Ç–∞—Ç—å –ª—ã—Å—ã–º. yyy: –Ω—É –≤–æ—Ç –∏ —è –æ —Ç–æ–º


In [32]:
print(generate_text("–¥–∞–≤–∞–π, –ø–æ—à–ª–∏ —É–∂–µ –≤ –º–∞–≥–∞–∑–∏–Ω"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


–¥–∞–≤–∞–π, –ø–æ—à–ª–∏ —É–∂–µ –≤ –º–∞–≥–∞–∑–∏–Ω.  xxx: –∞ —É –Ω–∞—Å –Ω–∞ —Ä–∞–±–æ—Ç–µ –µ—Å—Ç—å —Ç–∞–∫–∞—è —Ñ–∏—à–∫–∞ - –∫–æ–≥–¥–∞ —è –ø—Ä–∏—Ö–æ–∂—É –¥–æ–º–æ–π —Å —Ä–∞–±–æ—Ç—ã, —Ç–æ —Å—Ä–∞–∑—É –∂–µ –≤–∫–ª—é—á–∞—é —Ç–µ–ª–µ–≤–∏–∑–æ—Ä –∏ –Ω–∞—á–∏–Ω–∞—é —Å–º–æ—Ç—Ä–µ—Ç—å –Ω–æ–≤–æ—Å—Ç–∏ –ø–æ —Ç–µ–ª–∏–∫—É... yyy: —ç—Ç–æ –∫–∞–∫? —Ö—Ö—Ö:


In [33]:
print(generate_text("–¥–∞–≤–∞–π –∫—É–ø–∏–º –∞—Ä–±—É–∑ –≤ –º–∞–≥–∞–∑–∏–Ω–µ?"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


–¥–∞–≤–∞–π –∫—É–ø–∏–º –∞—Ä–±—É–∑ –≤ –º–∞–≥–∞–∑–∏–Ω–µ?  xxx: —É –º–µ–Ω—è –µ—Å—Ç—å –∑–Ω–∞–∫–æ–º–∞—è, –∫–æ—Ç–æ—Ä–∞—è –∂–∏–≤–µ—Ç —Å —Ä–æ–¥–∏—Ç–µ–ª—è–º–∏. –∏ –≤–æ—Ç –æ–¥–Ω–∞–∂–¥—ã –æ–Ω–∞ –ø—Ä–∏—Ö–æ–¥–∏—Ç –∫ –Ω–∏–º –Ω–∞ —Ä–∞–±–æ—Ç—É - –∞ —Ç–∞–º —Å–∏–¥–∏—Ç –µ–µ –º–∞–º–∞... yyy: –Ω—É —á—Ç–æ –∂ —Ç—ã —Ç–∞–∫ –¥–æ–ª–≥–æ –Ω–µ –ø—Ä–∏—Ö–æ–¥–∏—à—å?! —è –∂–µ —Ç–µ–±–µ –≥–æ–≤–æ—Ä–∏–ª–∞!


#### Data load

–æ–±—É—á–∏—Ç—å –º–æ–¥–µ–ª—å T5/ –∏–ª–∏ GPT –¥–ª—è –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏ –∑–∞–≥–æ–ª–æ–≤–∫–æ–≤ –¥–ª—è —Å—Ç–∞—Ç–µ–π
https://github.com/natasha/corus/load_lenta2  

In [34]:
!pip install corus



In [35]:
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2

--2023-08-18 09:15:05--  https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/87156914/619f9f00-1e96-11ea-946e-dac89df8aced?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230818%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230818T091506Z&X-Amz-Expires=300&X-Amz-Signature=06a829609aa533553b56f2b73030b8e08dbeddc83b218ad8510da179294e2b3e&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=87156914&response-content-disposition=attachment%3B%20filename%3Dlenta-ru-news.csv.bz2&response-content-type=application%2Foctet-stream [following]
--2023-08-18 09:15:06--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/87156914/619f9f00-1e96-11ea-946e-da

In [1]:
from corus import load_lenta2

path = 'lenta-ru-news.csv.bz2'
records = load_lenta2(path)
next(records)

LentaRecord(
    url='https://lenta.ru/news/1914/09/16/hungarnn/',
    title='1914. –†—É—Å—Å–∫–∏–µ –≤–æ–π—Å–∫–∞ –≤—Å—Ç—É–ø–∏–ª–∏ –≤\xa0–ø—Ä–µ–¥–µ–ª—ã –í–µ–Ω–≥—Ä–∏–∏  ',
    text='–ë–æ–∏ —É –°–æ–ø–æ—Ü–∫–∏–Ω–∞ –∏ –î—Ä—É—Å–∫–µ–Ω–∏–∫ –∑–∞–∫–æ–Ω—á–∏–ª–∏—Å—å –æ—Ç—Å—Ç—É–ø–ª–µ–Ω–∏–µ–º –≥–µ—Ä–º–∞–Ω—Ü–µ–≤. –ù–µ–ø—Ä–∏—è—Ç–µ–ª—å, –ø—Ä–∏–±–ª–∏–∑–∏–≤—à–∏—Å—å —Å —Å–µ–≤–µ—Ä–∞ –∫ –û—Å–æ–≤—Ü—É –Ω–∞—á–∞–ª –∞—Ä—Ç–∏–ª–ª–µ—Ä–∏–π—Å–∫—É—é –±–æ—Ä—å–±—É —Å –∫—Ä–µ–ø–æ—Å—Ç—å—é. –í –∞—Ä—Ç–∏–ª–ª–µ—Ä–∏–π—Å–∫–æ–º –±–æ—é –ø—Ä–∏–Ω–∏–º–∞—é—Ç —É—á–∞—Å—Ç–∏–µ —Ç—è–∂–µ–ª—ã–µ –∫–∞–ª–∏–±—Ä—ã. –° —Ä–∞–Ω–Ω–µ–≥–æ —É—Ç—Ä–∞ 14 —Å–µ–Ω—Ç—è–±—Ä—è –æ–≥–æ–Ω—å –¥–æ—Å—Ç–∏–≥ –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ–≥–æ –Ω–∞–ø—Ä—è–∂–µ–Ω–∏—è. –ü–æ–ø—ã—Ç–∫–∞ –≥–µ—Ä–º–∞–Ω—Å–∫–æ–π –ø–µ—Ö–æ—Ç—ã –ø—Ä–æ–±–∏—Ç—å—Å—è –±–ª–∏–∂–µ –∫ –∫—Ä–µ–ø–æ—Å—Ç–∏ –æ—Ç—Ä–∞–∂–µ–Ω–∞. –í –ì–∞–ª–∏—Ü–∏–∏ –º—ã –∑–∞–Ω—è–ª–∏ –î–µ–º–±–∏—Ü—É. –ë–æ–ª—å—à–∞—è –∫–æ–ª–æ–Ω–Ω–∞, –æ—Ç—Å—Ç—É–ø–∞–≤—à–∞—è –ø–æ —à–æ—Å—Å–µ –æ—Ç –ü–µ—Ä–µ–º—ã—à–ª—è –∫ –°–∞–Ω–æ–∫—É, –æ–±—Å—Ç—Ä–µ–ª–∏–≤–∞–ª–∞—Å—å —Å –≤—ã—Å–æ—

In [2]:

def load_lenta_to_list(path, max_number=None):
    records = load_lenta2(path)
    texts, titles = [], []
    for i, record in enumerate(records):
        texts.append(record.text)
        titles.append(record.title)
        if not max_number is None:
            if i >= max_number-1:
                break
    return texts, titles

In [3]:
texts, titles = load_lenta_to_list(path, max_number=20000)
print(len(texts))
print(len(titles))

20000
20000


In [15]:
df = pd.DataFrame({'text':texts, 'title':titles})
df.sample(3)

Unnamed: 0,text,title
8650,"–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –õ–∏–≤—à–∏—Ü, –∑–∞–Ω–∏–º–∞–≤—à–∏–π —Ä–∞–Ω–µ–µ –¥–æ–ª–∂–Ω–æ—Å—Ç—å —Å...",–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –õ–∏–≤—à–∏—Ü —Å—Ç–∞–ª —Å–æ–≤–µ—Ç–Ω–∏–∫–æ–º –ø—Ä–µ–º—å–µ—Ä–∞
4126,–ë–æ–ª–µ–µ 500 —Ç—ã—Å—è—á —Ñ–∏—Ä–º –≤ –†–æ—Å—Å–∏–∏ –∫–æ–Ω—Ç—Ä–æ–ª–∏—Ä—É—é—Ç—Å—è –ø...,–ë–∞–Ω–¥–∏—Ç—ã –∫–æ–Ω—Ç—Ä–æ–ª–∏—Ä—É—é—Ç –±–æ–ª–µ–µ 500¬†—Ç—ã—Å—è—á —Ä–æ—Å—Å–∏–π—Å–∫–∏...
5071,"–í–æ–ø—Ä–æ—Å —Å —Ä–æ—Å—Å–∏–π—Å–∫–∏–º —Ç–∞–Ω–∫–µ—Ä–æ–º ""–í–æ–ª–≥–æ–Ω–µ—Ñ—Ç—å-147"",...","–í–æ–ø—Ä–æ—Å —Å¬†—Ç–∞–Ω–∫–µ—Ä–æ–º ""–í–æ–ª–≥–æ–Ω–µ—Ñ—Ç—å"" –∑–∞–∫—Ä—ã—Ç"


In [16]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.1, random_state=1)

In [41]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m519.3/519.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m115.3/115.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m194.1/194.1 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 

In [17]:
from datasets import Dataset, DatasetDict

ds_data = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'test': Dataset.from_pandas(df_test)
})

ds_data

DatasetDict({
    train: Dataset({
        features: ['text', 'title', '__index_level_0__'],
        num_rows: 18000
    })
    test: Dataset({
        features: ['text', 'title', '__index_level_0__'],
        num_rows: 2000
    })
})

In [18]:
max_len_text = max(map(lambda txt: len(txt.split()), ds_data['train']['text']))
max_len_tl = max(map(lambda txt: len(txt.split()), ds_data['train']['title']))
max_len_text, max_len_tl

(1111, 18)

In [19]:
max_len_text, max_len_tl = 512, 20

# Preprocessing the data

In [20]:
model_name = "IlyaGusev/rut5_base_sum_gazeta"

In [21]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
tokenized_input = tokenizer('–ø—Ä–∏–≤–µ—Ç', padding='max_length', truncation=True, max_length=max_len_text, return_tensors = 'pt')
tokenized_input

{'input_ids': tensor([[20842,     1,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [23]:
def tokenize(batch):
    tokenized_input = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=max_len_text)
    tokenized_label = tokenizer(batch['title'], padding='max_length', truncation=True, max_length=max_len_tl)

    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

ds_data = ds_data.map(tokenize, batched=True, batch_size=8)

ds_data.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [24]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

# Preprocessing the data

In [25]:
!pip install transformers



In [26]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [27]:
training_args = TrainingArguments(
    "gen_title",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs= 2,
    remove_unused_columns=True, # Removes useless columns from the dataset
    save_strategy='no',
    report_to='none',
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_data['train'],
    eval_dataset=ds_data['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [29]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [30]:
output_dir = 'lenta2/output'
trainer.save_model(output_dir + '/model')

In [31]:
dataset_test = ds_data['test']

def title_pred(idx):
    input_text = dataset_test['text'][idx]
    input_title = dataset_test['title'][idx]

    use_cuda = False
    device = torch.device("cpu")

    with torch.no_grad():
        tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt').to(device)
        source_ids = tokenized_text['input_ids'].to(dtype = torch.long)
        source_mask = tokenized_text['attention_mask'].to(dtype = torch.long)
        generated_ids = model.generate(
            input_ids = source_ids,
            attention_mask = source_mask,
            max_length=1512,
            num_beams=7,
            temperature = 1.3,
            repetition_penalty=1,
            length_penalty=1,
            early_stopping=True,
            no_repeat_ngram_size=2
            ).to(device)
        pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    print("Text:\n" + input_text)
    print("Real title: " + input_title)
    print("Pred title: " + pred)

In [35]:
device = torch.device("cpu")
model = model.to(device)

In [36]:
title_pred(1)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Text:
–®–∏–∫–∞—Ä–Ω—ã–µ –±—É—Ç–∏–∫–∏, —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–Ω—ã–µ –≤ —Ü–µ–Ω—Ç—Ä–µ –ù—å—é-–ô–æ—Ä–∫–∞, —Ç–µ—Ä–ø—è—Ç –∫–æ–ª–æ—Å—Å–∞–ª—å–Ω—ã–µ —É–±—ã—Ç–∫–∏ –∏–∑-–∑–∞ —Ç–æ–ø-–º–æ–¥–µ–ª–µ–π –∏ –∞–∫—Ç—Ä–∏—Å. –£ –ø—Ä–∏–∑–Ω–∞–Ω–Ω—ã—Ö –∫—Ä–∞—Å–∞–≤–∏—Ü –ø–æ—è–≤–ª–æ—Å—å –Ω–æ–≤–æ–µ —Ö–æ–±–±–∏ - –∫—Ä–∞—Å—Ç—å –∏–∑ –¥–æ—Ä–æ–≥–∏—Ö –º–∞–≥–∞–∑–∏–Ω–æ–≤ –∫–∞–∫–∏–µ-–Ω–∏–±—É–¥—å –≤–µ—â–∏. –í —Ä–µ–∑—É–ª—å—Ç–∞—Ç–µ –≤ –±—É—Ç–∏–∫–∞—Ö Dolce & Gabbana, Prada –∏ Burberry's, —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–Ω—ã—Ö –Ω–∞ –ú—ç–¥–∏—Å–æ–Ω –∞–≤–µ–Ω—é –∏ –ü—è—Ç–æ–π –∞–≤–µ–Ω—é, —É—Å–∏–ª–µ–Ω–∞ –æ—Ö—Ä–∞–Ω–∞ –∏ —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω—ã –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–µ –∫–∞–º–µ—Ä—ã –Ω–∞–±–ª—é–¥–µ–Ω–∏—è. –ó–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–µ –ø–æ–ª–≥–æ–¥–∞ –Ω–∞ –∫—Ä–∞–∂–µ –≤ –±—É—Ç–∏–∫–∞—Ö –±—ã–ª–∏ –ø–æ–π–º–∞–Ω—ã —Ç—Ä–∏ –∏–∑–≤–µ—Å—Ç–Ω—ã–µ —Ç–æ–ø-–º–æ–¥–µ–ª–∏, —Å–æ–æ–±—â–∞–µ—Ç Ananova.com. –í—Å—è–∫–∏–π —Ä–∞–∑ —É–≥–æ–ª–æ–≤–Ω–æ–º—É –¥–µ–ª—É –Ω–µ –¥–∞–≤–∞–ª–∏ —Ö–æ–¥, –ø–æ—Ç–æ–º—É —á—Ç–æ –≤ —Å–∏—Ç—É–∞—Ü–∏—é –≤–º–µ—à–∏–≤–∞–ª–∏—Å—å –±–æ–≥–∞—Ç—ã–µ –ø–æ–∫–ª–æ–Ω–Ω–∏–∫–∏ –º

In [37]:
title_pred(2)

Text:
–£–∂–µ –Ω–∞ —Å–ª–µ–¥—É—é—â–µ–π –Ω–µ–¥–µ–ª–µ –Ω–∞ –£–∫—Ä–∞–∏–Ω–µ –º–æ–∂–µ—Ç –±—ã—Ç—å –≤–≤–µ–¥–µ–Ω–æ —á—Ä–µ–∑–≤—ã—á–∞–π–Ω–æ–µ –ø–æ–ª–æ–∂–µ–Ω–∏–µ. –ü–æ —Å–æ–æ–±—â–µ–Ω–∏—é —É–∫—Ä–∞–∏–Ω—Å–∫–∏—Ö –≤–ª–∞—Å—Ç–µ–π, —ç—Ç–æ —Å–≤—è–∑–∞–Ω–æ —Å —Ç—è–∂–µ–ª–µ–π—à–∏–º —ç–Ω–µ—Ä–≥–µ—Ç–∏—á–µ—Å–∫–∏–º –∫—Ä–∏–∑–∏—Å–æ–º, —Ä–∞–∑—Ä–∞–∑–∏–≤—à–∏–º—Å—è –≤ —Å—Ç—Ä–∞–Ω–µ. –ö–∞–∫ —Å–æ–æ–±—â–∏–ª–∞ –≤ –∏–Ω—Ç–µ—Ä–≤—å—é –í–í–° –≤–∏—Ü–µ-–ø—Ä–µ–º—å–µ—Ä –£–∫—Ä–∞–∏–Ω—ã –Æ–ª–∏—è –¢–∏–º–æ—à–µ–Ω–∫–æ,  –ø–æ–¥–∞—á–∞ —ç–ª–µ–∫—Ç—Ä–æ—ç–Ω–µ—Ä–≥–∏–∏ –≤ –Ω–µ–∫–æ—Ç–æ—Ä—ã—Ö —Ä–∞–π–æ–Ω–∞—Ö –∏–Ω–æ–≥–¥–∞ –ø—Ä–µ—Ä—ã–≤–∞–µ—Ç—Å—è  –Ω–∞ –¥–≤–µ–Ω–∞–¥—Ü–∞—Ç—å —á–∞—Å–æ–≤. "–ó–∞–ø–∞—Å—ã —Ç–æ–ø–ª–∏–≤–∞ –∏ –≥–æ—Ä—é—á–µ–≥–æ –∏—Å—Å—è–∫–∞—é—Ç. –í –Ω–∞—Å—Ç–æ—è—â–∏–π –º–æ–º–µ–Ω—Ç —Ä–∞–±–æ—Ç–∞–µ—Ç –ª–∏—à—å –æ–¥–Ω–∞ —Ç—Ä–µ—Ç—å —É–∫—Ä–∞–∏–Ω—Å–∫–∏—Ö –Ω–µ—Ñ—Ç–µ–ø–µ—Ä–µ—Ä–∞–±–∞—Ç—ã–≤–∞—é—â–∏—Ö –∑–∞–≤–æ–¥–æ–≤", - –∑–∞—è–≤–∏–ª–∞ –¢–∏–º–æ—à–µ–Ω–∫–æ. –£–∫—Ä–∞–∏–Ω—Å–∫–∏–π –¥–æ–ª–≥ –†–æ—Å—Å–∏–∏ –∑–∞ –ø–æ—Å—Ç–∞–≤–∫—É —ç–Ω–µ—Ä–≥–æ–Ω–æ—Å–∏—Ç–µ–ª–µ–π —Ä–µ–∑–∫–æ –≤–æ–

In [38]:
title_pred(3)

Text:
–í –ø–æ–¥–º–æ—Å–∫–æ–≤–Ω–æ–º –°–µ—Ä–≥–∏–µ–≤–æ–º –ü–æ—Å–∞–¥–µ —É–±–∏—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä –º—É–Ω–∏—Ü–∏–ø–∞–ª—å–Ω–æ–≥–æ –ø—Ä–µ–¥–ø—Ä–∏—è—Ç–∏—è "–≠–ª–µ–∫—Ç—Ä–æ—Å–µ—Ç—å" –ì–µ–Ω–Ω–∞–¥–∏–π –ï–≤–¥–æ–∫–∏–º–æ–≤. –ö–∞–∫ —Å–æ–æ–±—â–∏–ª–∏ "–ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É" –≤ –ì–£–í–î –ú–æ—Å–∫–æ–≤—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏, –≤ —Å—Ä–µ–¥—É –æ–∫–æ–ª–æ 8 —á–∞—Å–æ–≤ —É—Ç—Ä–∞ –Ω–∞ —É–ª–∏—Ü–µ –§–∞–±—Ä–∏—á–Ω–æ–π –Ω–µ–∏–∑–≤–µ—Å—Ç–Ω—ã–π –ø—Ä–µ—Å—Ç—É–ø–Ω–∏–∫ –≤—ã—Å—Ç—Ä–µ–ª–∏–ª –µ–º—É –≤ –≥–æ–ª–æ–≤—É. –¢—Ä—É–ø –±—ã–ª –Ω–∞–π–¥–µ–Ω –≤—Å–µ–≥–æ –≤ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –º–µ—Ç—Ä–∞—Ö –æ—Ç –æ—Ñ–∏—Å–∞ –ø—Ä–µ–¥–ø—Ä–∏—è—Ç–∏—è. –° 1990 –ø–æ 1995 –≥–æ–¥ –ì–µ–Ω–Ω–∞–¥–∏–π –ï–≤–¥–æ–∫–∏–º–æ–≤ –∑–∞–Ω–∏–º–∞–ª –ø–æ—Å—Ç –∑–∞–º–µ—Å—Ç–∏—Ç–µ–ª—è –≥–ª–∞–≤—ã –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ü–∏–∏ —Ä–∞–π–æ–Ω–∞ –∏ –∫—É—Ä–∏—Ä–æ–≤–∞–ª –≤–æ–ø—Ä–æ—Å—ã —Å—Ç—Ä–æ–∏—Ç–µ–ª—å—Å—Ç–≤–∞, –∑–∞—Ç–µ–º –Ω–µ–∫–æ—Ç–æ—Ä–æ–µ –≤—Ä–µ–º—è —Ä–∞–±–æ—Ç–∞–ª –≤ –∫–æ–º–º–µ—Ä—á–µ—Å–∫–æ–π —Å—Ç—Ä—É–∫—Ç—É—Ä–µ, –ø–æ–∑–∂–µ - –≥–ª–∞–≤–Ω—ã–º –∏–Ω–∂–µ–Ω–µ—Ä–æ–º –Ω–∞ –º–µ—Å—Ç–Ω–æ–º —Å—Ç–µ–∫–æ–ª—å–Ω–

In [39]:
title_pred(4)

Text:
–†–æ—Å—Å–∏–π—Å–∫–æ–µ –ø—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–æ –ø—Ä–∏–Ω—è–ª–æ —Ä–µ—à–µ–Ω–∏–µ –≤—ã–ø–ª–∞—Ç–∏—Ç—å –ø–µ–Ω—Å–∏–æ–Ω–µ—Ä–∞–º —è–Ω–≤–∞—Ä—Å–∫—É—é –ø–µ–Ω—Å–∏—é –≤ –¥–µ–∫–∞–±—Ä–µ, –∑–∞—è–≤–∏–ª –ø—Ä–µ–º—å–µ—Ä-–º–∏–Ω–∏—Å—Ç—Ä –†–æ—Å—Å–∏–∏ –í–ª–∞–¥–∏–º–∏—Ä –ü—É—Ç–∏–Ω, –≤—ã—Å—Ç—É–ø–∞—è –≤–æ –≤—Ç–æ—Ä–Ω–∏–∫ –Ω–∞ –∑–∞—Å–µ–¥–∞–Ω–∏–∏ –æ—Ä–≥–∫–æ–º–∏—Ç–µ—Ç–∞ –ø–æ –ø—Ä–æ–≤–µ–¥–µ–Ω–∏—é –ì–æ–¥–∞ –ø–æ–∂–∏–ª—ã—Ö –ª—é–¥–µ–π. –ü—É—Ç–∏–Ω –ø–æ–¥—á–µ—Ä–∫–Ω—É–ª, —á—Ç–æ —ç—Ç–æ –ª–∏—à—å –ø–µ—Ä–≤–∞—è —á–∞—Å—Ç—å –ø–æ–¥–∞—Ä–∫–∞ —Ä–æ—Å—Å–∏–π—Å–∫–∏–º –ø–µ–Ω—Å–∏–æ–Ω–µ—Ä–∞–º - –≤ —Ñ–µ–≤—Ä–∞–ª–µ –ø–µ–Ω—Å–∏–∏ –±—É–¥—É—Ç –µ—â–µ –∏ –ø—Ä–æ–∏–Ω–¥–µ–∫—Å–∏—Ä–æ–≤–∞–Ω—ã. –ö–∞–∫ –æ—Ç–º–µ—Ç–∏–ª –ø—Ä–µ–º—å–µ—Ä, –ø–æ–º–æ—â—å –Ω–∞ —ç—Ç–æ–º –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–∏ –±—É–¥–µ—Ç –∏–¥—Ç–∏ "–ø–æ—ç—Ç–∞–ø–Ω–æ, —à–∞–≥ –∑–∞ —à–∞–≥–æ–º". –ü—É—Ç–∏–Ω –Ω–∞–ø–æ–º–Ω–∏–ª, —á—Ç–æ —Å 1 –Ω–æ—è–±—Ä—è –ø–µ–Ω—Å–∏–∏ —É–∂–µ –±—ã–ª–∏ –ø—Ä–æ–∏–Ω–¥–µ–∫—Å–∏—Ä–æ–≤–∞–Ω—ã –Ω–∞ 15 –ø—Ä–æ—Ü–µ–Ω—Ç–æ–≤. –û–Ω –ø–æ–¥—á–µ—Ä–∫–Ω—É–ª, —á—Ç–æ –≤ —ç—Ç–æ–º –≥–æ–¥—É –ø—Ä–∞–≤–∏—Ç–µ

–ú–æ–¥–µ–ª—å –ø–æ–∫–∞–∑–∞–ª–∞ —Ö–æ—Ä–æ—à–∏–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç —Ö–æ—Ç—å –æ–±—É—á–∞–ª–∞—Å—å –º–µ–Ω—å—à–µ –º–∏–Ω—É—Ç—ã, –ø–æ–ª–Ω–æ–µ –¥–æ–æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ –∑–∞–Ω—è–ª–æ –±—ã –æ–∫–æ–ª–æ 3 —á–∞—Å–æ–≤. –¢—è–∂–µ–ª–æ —Ä–µ–∞–ª–∏–∑–æ–≤–∞—Ç—å –Ω–∞ –Ω–æ—É—Ç–±—É–∫–µ —Ç–∞–∫ –∫–∞–∫ —Ç—Ä–µ–±—É–µ—Ç –±–æ–ª—å—à–∏—Ö —Ä–µ—Å—É—Ä—Å–æ–≤ –≤–∏–¥–µ–æ–ø–∞–º—è—Ç–∏.