In [10]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Load and preprocess the data
def load_data(file_path):
    """Read a UTF-8 text file, lowercase it, and build character lookup tables.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A 4-tuple ``(text, chars, char_to_idx, idx_to_char)`` where ``text`` is
        the lowercased file contents, ``chars`` is the sorted list of distinct
        characters, and the two dicts map character -> position in ``chars``
        and position -> character respectively.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read().lower()

    # Sorting gives every character a stable index across runs.
    chars = sorted(set(text))
    idx_to_char = dict(enumerate(chars))
    char_to_idx = {symbol: position for position, symbol in enumerate(chars)}
    return text, chars, char_to_idx, idx_to_char

def preprocess_data(text, chars, char_to_idx, seq_length):
    """Slice ``text`` into fixed-length index windows with next-character targets.

    Window ``i`` covers ``text[i:i + seq_length]`` and its target is the
    character at ``text[i + seq_length]``.

    Args:
        text: Corpus string; every character used must be a key of ``char_to_idx``.
        chars: Vocabulary; its length is the scaling divisor for ``X`` and the
            number of classes for ``y``.
        char_to_idx: Mapping from character to integer index.
        seq_length: Number of characters in each input window.

    Returns:
        ``(X, y)``: ``X`` has shape ``(n_sequences, seq_length, 1)`` and holds
        the character indices divided by ``len(chars)``; ``y`` is the one-hot
        encoding (via ``to_categorical``) of the target indices with
        ``len(chars)`` classes. ``n_sequences`` is ``len(text) - seq_length``,
        or 0 when the text is not longer than ``seq_length``.

    Warns:
        UserWarning: when no window fits, because empty arrays would otherwise
            flow silently into training.
    """
    n_sequences = max(len(text) - seq_length, 0)
    if n_sequences == 0:
        import warnings

        warnings.warn(
            f'preprocess_data produced 0 sequences: text has {len(text)} characters '
            f'but seq_length is {seq_length}; the text must be longer than seq_length.',
            stacklevel=2,
        )

    # Encode the corpus once; each window is then just a slice of the encoded list.
    # Nothing is encoded when there are no windows, so no lookup can fail in that case.
    encoded = [char_to_idx[char] for char in text] if n_sequences else []
    X_data = [encoded[start:start + seq_length] for start in range(n_sequences)]
    y_data = encoded[seq_length:] if n_sequences else []

    X = np.reshape(X_data, (len(X_data), seq_length, 1))
    X = X / float(len(chars))  # Normalize input
    y = to_categorical(y_data, num_classes=len(chars))
    return X, y

# Corpus location and the number of characters in each training window
file_path = 'file1.txt'
seq_length = 100

# Read the corpus, then turn it into model-ready arrays
text, chars, char_to_idx, idx_to_char = load_data(file_path)
X, y = preprocess_data(
    text=text,
    chars=chars,
    char_to_idx=char_to_idx,
    seq_length=seq_length,
)

# Dataset summary.
# NOTE(review): the recorded run reports 60 characters and 0 sequences, i.e. the
# text was shorter than seq_length, so X and y came out empty.
print(
    f'Total characters: {len(text)}',
    f'Total unique characters: {len(chars)}',
    f'Total sequences: {len(X)}',
    sep='\n',
)


Total characters: 60
Total unique characters: 20
Total sequences: 0


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Build the RNN model
def build_model(seq_length, vocab_size):
    """Create and compile an LSTM classifier over the character vocabulary.

    Args:
        seq_length: Number of time steps per input sample; each step carries
            one feature, so the declared input shape is ``(seq_length, 1)``.
        vocab_size: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    recurrent = LSTM(256, input_shape=(seq_length, 1))
    classifier = Dense(vocab_size, activation='softmax')

    network = Sequential([recurrent, classifier])
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

# Instantiate the network for the current window length and vocabulary size,
# then print its layer table
model = build_model(seq_length=seq_length, vocab_size=len(chars))
model.summary()


In [15]:
# Report the dimensions of the feature and target arrays
print(f'Shape of X: {X.shape}', f'Shape of y: {y.shape}', sep='\n')


Shape of X: (0, 100, 1)
Shape of y: (0, 20)


In [16]:
# Re-compile the current `model` with the sparse variant of categorical cross-entropy.
# NOTE(review): the `y` built by preprocess_data is one-hot (to_categorical), whereas the
# sparse loss is used further down with integer-encoded labels — confirm which label
# encoding is intended before fitting this model.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')


In [18]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Example data (replace with your actual data loading and preprocessing).
# The sizes are named once so the arrays and the model below cannot drift apart,
# and the generator is seeded so re-running the cell yields the same X and y.
# (Model weight initialisation is not seeded here, so losses can still vary.)
SEED = 42
N_SAMPLES = 1000  # number of example sequences
N_STEPS = 50      # time steps per sequence
N_CLASSES = 10    # number of target classes

rng = np.random.default_rng(SEED)
X = rng.standard_normal((N_SAMPLES, N_STEPS, 1))  # Example input data
y = rng.integers(0, N_CLASSES, size=(N_SAMPLES,))  # Example target labels (integer-encoded)

# Step 1: Build the RNN model
model = Sequential([
    LSTM(256, input_shape=(N_STEPS, 1)),
    Dense(N_CLASSES, activation='softmax')
])

# Step 2: Compile the model with sparse categorical cross-entropy
# (matches the integer-encoded labels in y)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Step 3: Train the model
# The labels are drawn independently of X, so the loss is expected to stay close
# to ln(N_CLASSES) ~= 2.30, as in the recorded run.
epochs = 20
batch_size = 128
history = model.fit(X, y, epochs=epochs, batch_size=batch_size)


Epoch 1/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 130ms/step - loss: 2.3043
Epoch 2/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 147ms/step - loss: 2.2961
Epoch 3/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 136ms/step - loss: 2.2848
Epoch 4/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 125ms/step - loss: 2.2919
Epoch 5/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 128ms/step - loss: 2.2877
Epoch 6/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 125ms/step - loss: 2.2884
Epoch 7/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 126ms/step - loss: 2.2920
Epoch 8/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 167ms/step - loss: 2.2854
Epoch 9/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 139ms/step - loss: 2.2897
Epoch 10/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 124ms/step - loss: 2.2814
Epoch 11/

In [20]:
# Character -> position lookup over the current vocabulary list
char_to_idx = dict(zip(chars, range(len(chars))))


In [23]:
# Build the vocabulary from the dataset contents held in `text`.
# (The previous version iterated over the literal file name "file1.txt",
# which yields the characters of the name rather than of the data.)
chars = sorted(set(text))


In [28]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Function to preprocess text and create char-to-index and index-to-char mappings
def preprocess_text(text):
    """Build forward and reverse character lookup tables for ``text``.

    The distinct characters of ``text`` are numbered in sorted order starting
    at 0. One extra entry, ``'<unknown>'``, is given the next free index so
    callers have a default index for characters outside the vocabulary.

    Args:
        text: Source string whose characters define the vocabulary.

    Returns:
        ``(char_to_idx, idx_to_char)``; the second dict is the inverse of the
        first, including the ``'<unknown>'`` entry.
    """
    vocabulary = sorted(set(text))
    char_to_idx = dict(zip(vocabulary, range(len(vocabulary))))

    # Reserve the index right after the last real character for unknown input.
    char_to_idx['<unknown>'] = len(vocabulary)

    idx_to_char = dict(zip(char_to_idx.values(), char_to_idx.keys()))
    return char_to_idx, idx_to_char


# Function to generate sequences of input and output characters for training
def create_sequences(text, seq_length):
    """Pair every ``seq_length``-long window of ``text`` with the character after it.

    Args:
        text: Source string (any sliceable sequence works the same way).
        seq_length: Length of each input window.

    Returns:
        ``(sequences, next_chars)``: two parallel lists where
        ``sequences[i] == text[i:i + seq_length]`` and
        ``next_chars[i] == text[i + seq_length]``. Both are empty when the
        text is not longer than ``seq_length``.
    """
    starts = range(len(text) - seq_length)
    sequences = [text[start:start + seq_length] for start in starts]
    next_chars = [text[start + seq_length] for start in starts]
    return sequences, next_chars

# Function to generate text using the trained model
def generate_text(model, start_seed, char_to_idx, idx_to_char, seq_length, num_chars):
    """Greedily extend ``start_seed`` by ``num_chars`` model-predicted characters.

    The seed is encoded with ``char_to_idx`` (characters missing from the
    mapping get the ``'<unknown>'`` index) and then fitted to a window of
    exactly ``seq_length`` indices: a longer seed keeps only its trailing
    ``seq_length`` entries, a shorter one is left-padded with the
    ``'<unknown>'`` index. Previously ``seq_length`` was ignored and the model
    received ``len(start_seed)`` time steps regardless of the requested
    window length.

    Args:
        model: Object exposing ``predict(x, verbose=0)``; it is called with an
            array of shape ``(1, window, 1)`` and the argmax of its output is
            taken as the next character index.
        start_seed: Text to continue; it is always the prefix of the result.
        char_to_idx: Character -> index mapping; must contain ``'<unknown>'``.
        idx_to_char: Index -> character mapping.
        seq_length: Window length fed to the model. A falsy or non-positive
            value disables windowing and uses the encoded seed as-is.
        num_chars: Number of characters to generate.

    Returns:
        ``start_seed`` followed by the generated characters. A predicted index
        missing from ``idx_to_char`` contributes the literal ``'<unknown>'``.

    Raises:
        KeyError: if ``char_to_idx`` has no ``'<unknown>'`` entry.
    """
    unknown_index = char_to_idx['<unknown>']
    seed = [char_to_idx.get(char, unknown_index) for char in start_seed]

    # Fit the encoded seed to the requested window length.
    if seq_length and seq_length > 0:
        if len(seed) > seq_length:
            seed = seed[-seq_length:]
        elif len(seed) < seq_length:
            seed = [unknown_index] * (seq_length - len(seed)) + seed

    pieces = [start_seed]
    for _ in range(num_chars):
        x_pred = np.reshape(seed, (1, len(seed), 1))
        prediction = model.predict(x_pred, verbose=0)
        idx = int(np.argmax(prediction))

        # Convert index back to character, with a fallback for unknown indices
        pieces.append(idx_to_char.get(idx, '<unknown>'))

        # Slide the window: append the new index, drop the oldest one
        seed = (seed + [idx])[1:]

    return ''.join(pieces)

# Example text data
text_data = "the quick brown fox jumps over the lazy dog"
seq_length = 30  # Number of characters (time steps) in each training window
num_epochs = 20
batch_size = 128
num_chars_to_generate = 500

# Preprocess text and create char-to-index and index-to-char mappings
char_to_idx, idx_to_char = preprocess_text(text_data)

# Create sequences of input and output characters for training
sequences, next_chars = create_sequences(text_data, seq_length)

# Convert sequences and next_chars to numpy arrays:
# X holds the raw character index of every time step, y the one-hot target.
X = np.zeros((len(sequences), seq_length, 1), dtype=np.float32)
y = np.zeros((len(sequences), len(char_to_idx)), dtype=np.float32)
for i, seq in enumerate(sequences):
    for t, char in enumerate(seq):
        X[i, t, 0] = char_to_idx[char]
    y[i, char_to_idx[next_chars[i]]] = 1.0  # One-hot encode the output character

# Define the LSTM model; its input shape is taken from X, i.e. (seq_length, 1)
model = Sequential([
    LSTM(128, input_shape=(X.shape[1], X.shape[2])),
    Dense(len(char_to_idx), activation='softmax')
])

# Compile the model (one-hot targets -> categorical cross-entropy)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, epochs=num_epochs, batch_size=batch_size)

# Generate text using the trained model.
# The model was built for windows of exactly seq_length time steps, but this seed
# is longer than that. Only its trailing seq_length characters are fed to the
# model; the leading remainder is prepended again so the printed text still
# starts with the full seed.
start_seed = "the quick brown fox jumps over the lazy"
seed_prefix, seed_window = start_seed[:-seq_length], start_seed[-seq_length:]
generated_text = seed_prefix + generate_text(
    model, seed_window, char_to_idx, idx_to_char, seq_length, num_chars_to_generate
)
print("Generated Text:")
print(generated_text)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 3.4659
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 3.3087
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 3.1681
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 3.0425
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 2.9310
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 2.8321
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 2.7441
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 2.6653
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 2.5941
Epoch 10/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 2.5293
Epoch 11/20
[1m1/1[