**Однонаправленная LSTM с пословной токенизацией:**

In [None]:
import numpy as np
import os

import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow import keras
import keras.layers as l

from keras import models, callbacks, utils, losses

In [None]:
import os
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def request_url(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed document, built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    # Without a timeout requests.get can block forever on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page as book text.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def get_url_data(url: str) -> List[str]:
    """Scrape a single page into a list of text fragments.

    Args:
        url: Address of the page to scrape.

    Returns:
        The stripped text of the page's first ``<h1>`` followed by the
        stripped text of every ``<p>`` element found on the page.
    """
    page = request_url(url)
    heading = page.h1.text.strip()
    paragraphs = page.find_all('p')
    return [heading, *(paragraph.text.strip() for paragraph in paragraphs)]

def get_data(url: str) -> str:
    """Scrape the index page and every chapter it links to into one string.

    Args:
        url: Address of the index page. Chapter links are the ``<a>``
            elements on it that carry the CSS class ``link``.

    Returns:
        A single lower-cased string: the index page's ``<h1>`` and ``<h2>``
        (each with a trailing period) and the first ``<p>`` inside
        ``<article>``, followed by the fragments returned by
        ``get_url_data`` for each chapter, all joined with single spaces.

    Raises:
        AttributeError: If the index page lacks an ``<h1>``, an ``<h2>`` or
            an ``<article>`` containing a ``<p>``.
    """
    soup = request_url(url)

    text = [
        soup.h1.text.strip() + '.',
        soup.h2.text.strip() + '.',
        soup.article.p.text.strip(),
    ]

    # Resolve every link against the index URL so relative hrefs work too
    # (absolute hrefs pass through unchanged), and skip anchors that have no
    # href at all instead of handing None to the HTTP client.
    hrefs = (link.get('href') for link in soup.find_all('a', class_='link'))
    chapter_urls = [urljoin(url, href) for href in hrefs if href]

    # A dedicated loop name keeps the `url` argument from being overwritten.
    for chapter_url in chapter_urls:
        text.extend(get_url_data(chapter_url))

    return ' '.join(text).lower()

In [None]:
def load_data(url: str, file_name: str, path_dir: str = 'data/') -> str:
    """Return the corpus text, using a local file as a cache.

    If ``file_name`` already exists inside ``path_dir`` its contents are
    returned. Otherwise the text is scraped with ``get_data(url)``, written
    to that file (UTF-8) and returned.

    Args:
        url: Index page passed to ``get_data`` on a cache miss.
        file_name: Name of the cache file.
        path_dir: Directory holding the cache file, with or without a
            trailing separator. It is created when missing.

    Returns:
        The corpus text.

    Side effects:
        Prints which action was taken (directory created, file read, or
        file written).
    """
    if path_dir and not os.path.isdir(path_dir):
        # makedirs also copes with nested paths such as 'data/raw/'.
        os.makedirs(path_dir, exist_ok=True)
        print(f'Created {path_dir} directory')

    # os.path.join inserts the separator only when path_dir lacks one.
    path_file = os.path.join(path_dir, file_name)

    try:
        with open(path_file, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        # Only a missing cache file triggers a download; other failures
        # (permissions, decoding errors, Ctrl+C) are no longer swallowed.
        text = get_data(url)

        with open(path_file, 'w', encoding='utf-8') as file:
            file.write(text)

        print('Saved to', path_file)
    else:
        print('Uploaded from', path_file)

    return text

In [None]:
# Read the book text from data/hpmor.txt when that file exists; otherwise scrape it and cache it there.
text = load_data('https://hpmor.ru/', 'hpmor.txt')

Created data/ directory
Saved to data/hpmor.txt


In [None]:
import re
def split_into_sentences(text: str, regex: str = '[^а-яА-ЯёЁ0-9 ,-]') -> List[str]:
    """Cut ``text`` at periods and strip unwanted characters from each piece.

    Args:
        text: Raw text to split.
        regex: Pattern matching the characters to delete. The default keeps
            only Cyrillic letters, digits, spaces, commas and hyphens.

    Returns:
        The cleaned, whitespace-trimmed pieces in their original order;
        pieces that end up empty are left out.
    """
    unwanted = re.compile(regex)
    cleaned = (unwanted.sub('', chunk).strip() for chunk in text.split('.'))
    return [sentence for sentence in cleaned if sentence]

In [None]:
# Split the corpus into cleaned sentences; the pattern passed here is the same as the function's default.
text_word = split_into_sentences(text, "[^а-яА-ЯёЁ0-9 ,-]")

In [None]:
# Keep at most the first 1,000,000 sentences (a no-op when there are fewer).
text_word = text_word[:1000000]

**Токенизация**

In [None]:
# Vocabulary: the sorted unique tokens obtained by re-joining the sentences and splitting on single spaces.
# NOTE(review): split(' ') produces '' wherever two spaces are adjacent, so '' can end up as a vocabulary
# entry, while the vectorization cell uses split() and therefore never looks that entry up.
dictionary = np.array(sorted(set(' '.join(text_word).split(' '))))
dictionary

array(['', ',', '-', ..., 'ёовтен', 'ёрзал', 'ёрзали'], dtype='<U70')

In [None]:
# Lookup tables between vocabulary words and their integer ids (positions in `dictionary`).
tokenizer_word_to_index_word = {word: index for index, word in enumerate(dictionary)}
tokenizer_index_to_word_word = dict(enumerate(dictionary))

**Формирование датасета:**

In [None]:
# Encode the corpus as an array of vocabulary indices, one per whitespace-separated token.
vectorized_text = np.array([tokenizer_word_to_index_word[word] for word in ' '.join(text_word).split()])

In [None]:
# Wrap the index array in a tf.data.Dataset that is sliced along its first axis (one token id per element).
# NOTE(review): this rebinds `vectorized_text` from an array to a Dataset, so re-running this cell on its
# own would pass a Dataset back in — confirm whether a separate name is wanted.
vectorized_text = Dataset.from_tensor_slices(vectorized_text)

In [None]:
# Vocabulary size: one entry per distinct word in the lookup table.
VOCAB_LEN = len(tokenizer_word_to_index_word)
# Used below both as the token-window length and as the number of windows per training batch.
BATCH_SIZE = 100

In [None]:
# Group consecutive token ids into windows of BATCH_SIZE tokens; a shorter final window is dropped.
sequences = vectorized_text.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Steps per epoch: how many full batches of BATCH_SIZE windows `sequences` can supply.
BATCHES_PER_EPOCH = len(sequences) // BATCH_SIZE

In [None]:
from typing import Tuple, List
def get_features_target(seq: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Split a token window into next-token training inputs and labels.

    Args:
        seq: Sliceable sequence of token ids.

    Returns:
        ``(features, target)`` where ``features`` is ``seq`` without its last
        element and ``target`` is ``seq`` without its first, so ``target[i]``
        is the token that follows ``features[i]``.
    """
    return seq[:-1], seq[1:]

In [None]:
# Training pipeline: every window becomes an (input, next-token target) pair of length BATCH_SIZE - 1,
# pairs are grouped into batches of BATCH_SIZE (incomplete batch dropped), and the stream repeats endlessly.
df = sequences.map(get_features_target).batch(BATCH_SIZE, drop_remainder=True).repeat()
# Prefetch upcoming batches; AUTOTUNE leaves the buffer size to tf.data.
df = df.prefetch(AUTOTUNE)

**Построение и обучение модели:**

In [None]:
# Text-vectorization layer capped at as many tokens as the vocabulary holds.
# NOTE(review): the model defined in the next cell does not reference `vectorizer` — confirm it is
# used further down the notebook.
vectorizer = keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=len(dictionary),
    input_shape=(1,))

In [None]:
import keras.layers as l
# Width of the learned word vectors. Deliberately independent of BATCH_SIZE so that
# changing the batch size does not silently resize the embedding.
EMBEDDING_DIM = 100

model = keras.Sequential([
    # Maps each token id to a dense vector; the batch size is fixed because the LSTM is stateful.
    l.Embedding(len(dictionary), EMBEDDING_DIM, batch_input_shape=[BATCH_SIZE, None]),
    # Emits a hidden state per timestep so every position gets a next-word prediction.
    l.LSTM(64, return_sequences=True, stateful=True),
    # Unnormalized scores over the vocabulary for each timestep (no softmax applied).
    l.Dense(len(dictionary))
])

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (100, None, 100)          6108100   
                                                                 
 lstm (LSTM)                 (100, None, 64)           42240     
                                                                 
 dense (Dense)               (100, None, 61081)        3970265   
                                                                 
Total params: 10120605 (38.61 MB)
Trainable params: 10120605 (38.61 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# from_logits=True because the final Dense layer has no activation and outputs raw scores;
# the targets are integer token ids, hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [None]:
# `df` repeats indefinitely, so steps_per_epoch decides where each epoch ends.
# NOTE(review): fit() returns a training History object rather than predictions, despite the name `preds`.
preds = model.fit(
    df,
    epochs=100,
    steps_per_epoch=BATCHES_PER_EPOCH
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78