In [None]:
# Загрузка данных
with open('/content/Viking.txt', 'r', encoding='utf-8') as file:
    script_lines = file.read()

In [None]:
import re

def extract_ragnar_dialogues(text):
    ragnar_dialogues = []
    lines = text.split("\n")
    for line in lines:
        if "Ragnar:" in line:
            dialogue = re.sub(r'\([^)]*\)', '', line)
            dialogue = re.sub(r'\<[^>]*\>', '', dialogue)
            dialogue = re.sub(r'Ragnar:', '', dialogue)
            ragnar_dialogues.append(dialogue.strip())
        elif re.search(r'\bRagnar\b(?!\:)', line):
            continue

    return ragnar_dialogues

In [None]:
ragnar_dialogues = extract_ragnar_dialogues(script_lines)
for dialogue in ragnar_dialogues:
    print(dialogue)

In [None]:
def save_dialogues_to_txt(dialogues, filename):
    with open(filename, "w", encoding="utf-8") as file:
        for dialogue in dialogues:
            file.write(dialogue + "\n")

# Применяем функцию для сохранения диалогов в файл
save_dialogues_to_txt(ragnar_dialogues, "ragnar_dialogues.txt")

In [3]:
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

# Загрузим и прочитаем файл с диалогами
with open('/content/ragnar_dialogues.txt', 'r', encoding='utf-8') as file:
    dialogues = file.read()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Удаление специальных символов и цифр
dialogues_clean = re.sub(r'[^a-zA-Zа-яА-Я\s]', '', dialogues)

# Нормализация: приведение к нижнему регистру
dialogues_lower = dialogues_clean.lower()

# Токенизация по предложениям
sentences = sent_tokenize(dialogues_lower)

print(sentences)

['the earl will deal with some criminal offences and then well discuss the summer raids\nthe earl decides\ni went to confess my love to her but i was set upon by a bear and an enormous hound who guarded her home\nand\nthats right\nsay i wasnt there\nwhy dont you go lie down hmm\ni know\nnow this candle is the sun\nthats it\nhe wants to die well without fear to atone for his sins\nhe should not have done that\nmy lord we all want to feast but we also want to know where we will be raiding this summer\nevery year we go to the same places\nlet me see\nstay with your uncle\ni cant be sure\nto talk to the gods\ni want to know what the gods have in store\nto have this great future must i challenge the law\nwait outside\nwe have someone special to visit\nyes only different\nfloki this is my son bjorn\nwhy unfortunately\nfloki is a boatbuilder\nso what about our boat\nthe hull is deeper how will my men set their oars\nfor the anchor\nhmm\nlet him stay awhile\nhey\nwe are brothers\ni saw somethi

In [5]:
from nltk.probability import FreqDist

# Токенизация по словам для всего текста
words = word_tokenize(dialogues_lower)

# Частотный анализ слов
fdist = FreqDist(words)

print("Наиболее часто встречающиеся слова:")
print(fdist.most_common(10))

Наиболее часто встречающиеся слова:
[('to', 75), ('you', 71), ('the', 67), ('i', 56), ('we', 33), ('and', 27), ('is', 27), ('a', 24), ('this', 22), ('are', 22)]


In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Получение списка стоп-слов для английского языка
stop_words = set(stopwords.words('english'))

# Фильтрация слов, исключая стоп-слова
filtered_words = [word for word in words if word not in stop_words]

# Частотный анализ отфильтрованных слов
fdist_filtered = FreqDist(filtered_words)

print("Наиболее часто встречающиеся слова после исключения стоп-слов:")
print(fdist_filtered.most_common(10))

Наиболее часто встречающиеся слова после исключения стоп-слов:
[('want', 13), ('us', 13), ('go', 12), ('know', 10), ('vikings', 10), ('well', 9), ('dont', 9), ('see', 9), ('previously', 9), ('gods', 8)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
!pip install -q transformers torch

In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
from torch.utils.data import Dataset, DataLoader
import torch

class RagnarDialoguesDataset(Dataset):
    def __init__(self, dialogues, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for dialogue in dialogues:
            encodings_dict = tokenizer(dialogue, truncation=True,
                                        max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [11]:
# Установка токена заполнения
tokenizer.pad_token = tokenizer.eos_token

# Создание датасета
dataset = RagnarDialoguesDataset(sentences, tokenizer, max_length=512)

# DataLoader
loader = DataLoader(dataset, batch_size=4, shuffle=True)

In [18]:
# Обучение модели
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Определение оптимизатора и цикла обучения
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [13]:
for epoch in range(500):  # Примерное количество эпох
    model.train()
    for batch in loader:
        inputs, masks = batch
        inputs, masks = inputs.to(device), masks.to(device)
        outputs = model(inputs, attention_mask=masks, labels=inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch} loss: {loss.item()}")

Epoch 0 loss: 4.440732955932617
Epoch 1 loss: 4.297510623931885
Epoch 2 loss: 4.249654293060303
Epoch 3 loss: 4.1080641746521
Epoch 4 loss: 4.191305160522461
Epoch 5 loss: 4.0749125480651855
Epoch 6 loss: 4.003146648406982
Epoch 7 loss: 3.9972667694091797
Epoch 8 loss: 3.931884765625
Epoch 9 loss: 3.833780288696289
Epoch 10 loss: 3.8089518547058105
Epoch 11 loss: 3.7465269565582275
Epoch 12 loss: 3.7382969856262207
Epoch 13 loss: 3.70090913772583
Epoch 14 loss: 3.606881856918335
Epoch 15 loss: 3.568739414215088
Epoch 16 loss: 3.6145660877227783
Epoch 17 loss: 3.49996018409729
Epoch 18 loss: 3.4611473083496094
Epoch 19 loss: 3.4567511081695557
Epoch 20 loss: 3.409980058670044
Epoch 21 loss: 3.3419911861419678
Epoch 22 loss: 3.3816494941711426
Epoch 23 loss: 3.2625577449798584
Epoch 24 loss: 3.2483506202697754
Epoch 25 loss: 3.2207300662994385
Epoch 26 loss: 3.1264495849609375
Epoch 27 loss: 3.1917054653167725
Epoch 28 loss: 3.0889008045196533
Epoch 29 loss: 3.071927309036255
Epoch 30 lo

In [25]:
input_text = "How are you?"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
output = model.generate(
    input_ids,
    max_length=15,
    num_beams=3,
    early_stopping=True,
    temperature=0.9,
    top_k=50,
    top_p=0.92
)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


How are you?

I was just having an argument with some friends


In [17]:
model.save_pretrained("/content/drive/MyDrive/Skillbox/model")
tokenizer.save_pretrained("/content/drive/MyDrive/Skillbox/tokenizer")

('/content/drive/MyDrive/Skillbox/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Skillbox/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Skillbox/tokenizer/vocab.json',
 '/content/drive/MyDrive/Skillbox/tokenizer/merges.txt',
 '/content/drive/MyDrive/Skillbox/tokenizer/added_tokens.json')

In [15]:
model.save_pretrained("/model")
tokenizer.save_pretrained("/tokenizer")

('/tokenizer/tokenizer_config.json',
 '/tokenizer/special_tokens_map.json',
 '/tokenizer/vocab.json',
 '/tokenizer/merges.txt',
 '/tokenizer/added_tokens.json')