In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [2]:
# Игрушечный тестовый набор данных
text_data = [
    "Это первый документ.",
    "Этот документ — второй документ.",
    "А это тот самый третий документ.",
    "Это точно первый документ?"
]

In [3]:
# Токенизируем текстовые данные
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1


In [4]:
tokenizer.word_index

{'документ': 1,
 'это': 2,
 'первый': 3,
 'этот': 4,
 '—': 5,
 'второй': 6,
 'а': 7,
 'тот': 8,
 'самый': 9,
 'третий': 10,
 'точно': 11}

In [5]:
tokenizer.index_word

{1: 'документ',
 2: 'это',
 3: 'первый',
 4: 'этот',
 5: '—',
 6: 'второй',
 7: 'а',
 8: 'тот',
 9: 'самый',
 10: 'третий',
 11: 'точно'}

In [6]:
# Создание входных последовательностей и меток для обучения
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [7]:
input_sequences

[[2, 3],
 [2, 3, 1],
 [4, 1],
 [4, 1, 5],
 [4, 1, 5, 6],
 [4, 1, 5, 6, 1],
 [7, 2],
 [7, 2, 8],
 [7, 2, 8, 9],
 [7, 2, 8, 9, 10],
 [7, 2, 8, 9, 10, 1],
 [2, 11],
 [2, 11, 3],
 [2, 11, 3, 1]]

In [8]:
len(input_sequences)

14

In [9]:
max_sequence_length = max(len(seq) for seq in input_sequences)
max_sequence_length

6

In [10]:
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
input_sequences

array([[ 0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  2,  3,  1],
       [ 0,  0,  0,  0,  4,  1],
       [ 0,  0,  0,  4,  1,  5],
       [ 0,  0,  4,  1,  5,  6],
       [ 0,  4,  1,  5,  6,  1],
       [ 0,  0,  0,  0,  7,  2],
       [ 0,  0,  0,  7,  2,  8],
       [ 0,  0,  7,  2,  8,  9],
       [ 0,  7,  2,  8,  9, 10],
       [ 7,  2,  8,  9, 10,  1],
       [ 0,  0,  0,  0,  2, 11],
       [ 0,  0,  0,  2, 11,  3],
       [ 0,  0,  2, 11,  3,  1]], dtype=int32)

In [11]:
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)


In [12]:
model_rnn = Sequential()
model_rnn.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model_rnn.add(SimpleRNN(100))
model_rnn.add(Dense(total_words, activation='softmax'))
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X, y, epochs=100, verbose=0)




<keras.src.callbacks.history.History at 0x24854a9ea40>

In [13]:
model_gru = Sequential()
model_gru.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model_gru.add(GRU(100))
model_gru.add(Dense(total_words, activation='softmax'))
model_gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_gru.fit(X, y, epochs=100, verbose=0)

<keras.src.callbacks.history.History at 0x24856586770>

In [14]:
model_lstm = Sequential()
model_lstm.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model_lstm.add(LSTM(100))
model_lstm.add(Dense(total_words, activation='softmax'))
model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_lstm.fit(X, y, epochs=100, verbose=0)

<keras.src.callbacks.history.History at 0x2485919f280>

In [15]:
def generate_text(seed_text, model, max_sequence_len, num_words):
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [16]:
generated_text_rnn = generate_text("Это", model_rnn, max_sequence_length, num_words=5)
generated_text_gru = generate_text("Это", model_gru, max_sequence_length, num_words=5)
generated_text_lstm = generate_text("Это", model_lstm, max_sequence_length, num_words=5)

print("Сгенерированный текст (SimpleRNN):", generated_text_rnn)
print("Сгенерированный текст (GRU):", generated_text_gru)
print("Сгенерированный текст (LSTM):", generated_text_lstm)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 674ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 415ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s