In [5]:
from gibberish import gen_gibberish
from char_indexing import get_char_indexing, convert_word_to_int, deconvert_integers_to_word
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from sklearn.model_selection import train_test_split

In [6]:
char_set = list(" abcdefghijklmnopqrstuvwxyz0123456789")
char2int, int2char = get_char_indexing(char_set)

In [7]:
# Dummy data (you would replace this with your own data)
REPETITION_NUMBER = 100
correct_words = ['apple', 'banana', 'orange', 'grape', 'pineapple'] * REPETITION_NUMBER
misspelled_words = [gen_gibberish(word, char_set) for word in correct_words]
misspelled_words

['apple',
 'banna',
 'orange',
 'grape',
 'pineapple',
 'apple',
 'banana',
 'orange',
 'grape',
 'pinatple',
 'apple',
 'bancana',
 'orfnge',
 'grape',
 'pineapplxe',
 'appl',
 'banana',
 'orlnge',
 'gape',
 'pineaprple',
 'apple',
 'banaena',
 'orange',
 'grpe',
 'pineapple',
 'apple',
 'banaaaa',
 'oracge',
 'grap6e',
 'piyapple',
 'apple',
 'banan',
 'oranpe',
 'grae',
 'pineaqple',
 'appe',
 'banana',
 'orange',
 'grpe',
 'pineaple',
 'apqle',
 'banana',
 'orangs',
 'tgrape',
 'pineapple',
 'appl5',
 'banana',
 'or6anrge',
 'g6rape',
 '9pintapple',
 'ap4ple',
 'banaga',
 'orangh',
 'grqape',
 'pineapre',
 'applq',
 'banana',
 'ormange',
 'grapae',
 'pij2eapple',
 'apple',
 'banana',
 'orahnge',
 'grape',
 'pin2ap9ple',
 'applf',
 'banarna',
 'orange',
 'grave',
 'pieapple',
 'applm',
 'b2anaa',
 'orange',
 'gape',
 'pineappie',
 'apple',
 'b1anana',
 'orangr',
 'gape',
 'pineapp e',
 'aple',
 'banana',
 'eorange',
 'grape',
 'pieapplue',
 'applle',
 'banana',
 'orange',
 'grape',


In [8]:
# Convert words to sequences of character indices
X = [convert_word_to_int(word, char_set, char2int) for word in misspelled_words]
y = [convert_word_to_int(word, char_set, char2int) for word in correct_words]

[[1, 16, 16, 12, 5],
 [2, 1, 14, 14, 1],
 [15, 18, 1, 14, 7, 5],
 [7, 18, 1, 16, 5],
 [16, 9, 14, 5, 1, 16, 16, 12, 5],
 [1, 16, 16, 12, 5],
 [2, 1, 14, 1, 14, 1],
 [15, 18, 1, 14, 7, 5],
 [7, 18, 1, 16, 5],
 [16, 9, 14, 1, 20, 16, 12, 5],
 [1, 16, 16, 12, 5],
 [2, 1, 14, 3, 1, 14, 1],
 [15, 18, 6, 14, 7, 5],
 [7, 18, 1, 16, 5],
 [16, 9, 14, 5, 1, 16, 16, 12, 24, 5],
 [1, 16, 16, 12],
 [2, 1, 14, 1, 14, 1],
 [15, 18, 12, 14, 7, 5],
 [7, 1, 16, 5],
 [16, 9, 14, 5, 1, 16, 18, 16, 12, 5],
 [1, 16, 16, 12, 5],
 [2, 1, 14, 1, 5, 14, 1],
 [15, 18, 1, 14, 7, 5],
 [7, 18, 16, 5],
 [16, 9, 14, 5, 1, 16, 16, 12, 5],
 [1, 16, 16, 12, 5],
 [2, 1, 14, 1, 1, 1, 1],
 [15, 18, 1, 3, 7, 5],
 [7, 18, 1, 16, 33, 5],
 [16, 9, 25, 1, 16, 16, 12, 5],
 [1, 16, 16, 12, 5],
 [2, 1, 14, 1, 14],
 [15, 18, 1, 14, 16, 5],
 [7, 18, 1, 5],
 [16, 9, 14, 5, 1, 17, 16, 12, 5],
 [1, 16, 16, 5],
 [2, 1, 14, 1, 14, 1],
 [15, 18, 1, 14, 7, 5],
 [7, 18, 16, 5],
 [16, 9, 14, 5, 1, 16, 12, 5],
 [1, 16, 17, 12, 5],
 [2, 1, 14,

In [9]:
# Pad sequences to the same length and convert to numpy arrays
max_len = max(max(len(word) for word in X), max(len(word) for word in y))
X_padded = np.array([word + [0] * (max_len - len(word)) for word in X])
y_padded = np.array([word + [0] * (max_len - len(word)) for word in y])
X_padded, y_padded

(array([[ 1, 16, 16, ...,  0,  0,  0],
        [ 2,  1, 14, ...,  0,  0,  0],
        [15, 18,  1, ...,  0,  0,  0],
        ...,
        [ 4, 15, 18, ...,  0,  0,  0],
        [ 7, 18,  1, ...,  0,  0,  0],
        [16,  9, 14, ...,  0,  0,  0]]),
 array([[ 1, 16, 16, ...,  0,  0,  0],
        [ 2,  1, 14, ...,  0,  0,  0],
        [15, 18,  1, ...,  0,  0,  0],
        ...,
        [15, 18,  1, ...,  0,  0,  0],
        [ 7, 18,  1, ...,  0,  0,  0],
        [16,  9, 14, ...,  5,  0,  0]]))

In [10]:
# Split up the dataset into training and test dataset
# X_train, X_test, y_train, y_test = train_test_split(X_padded, y_padded, test_size=0.2)

In [11]:
# Define the model
model = Sequential()
model.add(Embedding(len(char_set), 10, input_length=max_len))
model.add(LSTM(50, return_sequences=True))
model.add(Dense(len(char_set), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_padded, y_padded, epochs=300, batch_size=32)





Epoch 1/300


Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch

<keras.src.callbacks.History at 0x1aad24a7d90>

In [16]:
def correct_spelling(word, max_len, model):
    word_indices  = [char2int[char] for char in word]
    padded_word_indices  = np.array([word_indices  + [0] * (max_len - len(word_indices))])
    predicted_indices = np.argmax(model.predict(padded_word_indices), axis=-1)
    predicted_chars = deconvert_integers_to_word(predicted_indices[0], int2char)
    corrected_word = ''.join(predicted_chars)
    return corrected_word

print(correct_spelling('orgggne', max_len, model))



orange     
