In [1]:
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input
from keras.utils import to_categorical
import random

# Step 1: Load the data
# Decode explicitly as UTF-8: without an encoding argument open() falls back to
# the platform's locale encoding, which would make the character vocabulary
# (and therefore the model's input/output size) differ between machines.
with open('data.txt', 'r', encoding='utf-8') as file:
    data = file.read().lower()  # Load and convert to lowercase

# Step 2: Preprocess the data
# Collect every distinct character; sorting makes the index assigned to each
# character the same on every run.
chars = sorted(set(data))
print(f"Total unique characters: {len(chars)}")

# Two-way lookup tables between characters and their integer indices
char_to_index = {char: idx for idx, char in enumerate(chars)}
index_to_char = dict(enumerate(chars))

# Step 3: Prepare the sequences (X and Y)
sequence_length = 100  # Length of input sequences

# Every sample needs `sequence_length` input characters plus one target
# character, so a shorter text cannot produce a single training sample.
if len(data) <= sequence_length:
    raise ValueError(
        f"input text must contain more than {sequence_length} characters "
        f"to build training sequences (got {len(data)})"
    )

# Encode the whole text once, rather than looking every character up again
# for each of the overlapping windows it appears in.
encoded = [char_to_index[char] for char in data]
num_samples = len(encoded) - sequence_length

# Sample i is the window encoded[i : i + sequence_length]; its label is the
# character that immediately follows that window.
X = [encoded[i:i + sequence_length] for i in range(num_samples)]
Y = encoded[sequence_length:]

# Reshape X to be [samples, time steps, features]
X = np.reshape(X, (num_samples, sequence_length, 1))

# Normalize X by dividing by the total number of unique characters
X = X / float(len(chars))

# One-hot encode the labels Y
Y = to_categorical(Y, num_classes=len(chars))

# Step 4: Build the LSTM model
# Two stacked 128-unit LSTM layers followed by a softmax over the vocabulary.
# The first LSTM returns its full output sequence so the second one has a
# sequence to consume.
model = Sequential([
    Input(shape=X.shape[1:]),  # (time steps, features)
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(len(chars), activation='softmax'),
])

# Compile: categorical cross-entropy pairs with the one-hot encoded labels
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Step 5: Train the model
model.fit(X, Y, batch_size=64, epochs=20)

# Step 6: Generate new sequence based on a seed
def generate_sequence(seed, length=200):
    """Generate text by repeatedly predicting the character that follows *seed*.

    The seed is lowercased, encoded with ``char_to_index`` and fitted to a
    window of exactly ``sequence_length`` indices, because the model is built
    with a fixed ``(sequence_length, 1)`` input. A longer seed keeps only its
    last ``sequence_length`` characters; a shorter one is left-padded with the
    index of the space character (or index 0 if the vocabulary has no space).
    Each step feeds the normalised window to ``model``, takes the most likely
    next character (greedy argmax) and slides the window forward by one.

    Args:
        seed: Starting text. After lowercasing, every character must be a key
            of ``char_to_index``.
        length: Number of characters to generate.

    Returns:
        The lowercased seed followed by ``length`` generated characters.

    Raises:
        KeyError: If the seed contains a character missing from
            ``char_to_index``.
    """
    seed = seed.lower()  # Convert seed to lowercase

    # Report all unknown characters at once instead of failing on the first
    unknown = sorted(set(seed) - set(char_to_index))
    if unknown:
        raise KeyError(
            f"seed contains characters missing from the vocabulary: {unknown}"
        )

    # Fit the seed to the window size the model was built for: truncate from
    # the left (keep the most recent context) or left-pad when it is too short.
    pad_index = char_to_index.get(' ', 0)
    pattern = [char_to_index[char] for char in seed][-sequence_length:]
    pattern = [pad_index] * (sequence_length - len(pattern)) + pattern

    scale = float(len(chars))  # Same normalisation constant as the training data
    generated = []

    # Generate characters
    for _ in range(length):
        x = np.reshape(pattern, (1, sequence_length, 1)) / scale
        predicted = model.predict(x, verbose=0)

        # Greedy decoding: pick the highest-probability character
        index = int(np.argmax(predicted))
        generated.append(index_to_char[index])

        # Slide the window: drop the oldest index, append the newest
        pattern = pattern[1:] + [index]

    return seed + "".join(generated)

# Example: produce 500 generated characters continuing the seed 'kanker'
generated_text = generate_sequence(seed='kanker', length=500)
print(generated_text)

Total unique characters: 48
Epoch 1/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 105ms/step - loss: 3.1386
Epoch 2/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 110ms/step - loss: 3.0003
Epoch 3/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 103ms/step - loss: 2.9884
Epoch 4/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 107ms/step - loss: 2.9735
Epoch 5/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 105ms/step - loss: 2.8853
Epoch 6/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 128ms/step - loss: 2.8038
Epoch 7/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 120ms/step - loss: 2.7452
Epoch 8/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 126ms/step - loss: 2.6947
Epoch 9/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 113ms/step - loss: 2.6236
Epoch 10/20
[1m231/231[0m [32m━━━