In [None]:
from datasets import load_dataset

# Load the "agentlans/grammar-correction" dataset through `datasets.load_dataset`.
# Later cells read its 'train' and 'validation' splits.
ds = load_dataset("agentlans/grammar-correction")

In [None]:
# Display the loaded dataset object to inspect what it contains.
ds

In [None]:
# Keep a handle on each split used below.
# NOTE(review): despite the `df_` prefix these are whatever `ds[...]` returns,
# not necessarily pandas DataFrames.
df_train, df_validation = ds['train'], ds['validation']

In [None]:
# Peek at the second entry of the training 'output' column
# (presumably the corrected sentence — verify against the dataset card).
df_train['output'][1]

In [None]:
# Materialise the 'input' and 'output' columns of each split as plain Python lists.
# Naming: `*_in` holds the 'input' column and `*_val` holds the 'output' column;
# the `test_*` lists come from the 'validation' split.
train_in = list(df_train['input'])
train_val = list(df_train['output'])

test_in = list(df_validation['input'])
test_val = list(df_validation['output'])


In [None]:
# Spot-check the second input sentence taken from the validation split.
test_in[1]

In [None]:
# Intentionally left empty: the logic that used to live in this cell was removed because it interfered with tokenizer preparation.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# One tokenizer shared by source and target sentences.
#   oov_token="<unk>" : words not seen while fitting are mapped to "<unk>".
#   lower=False       : keep the original casing of every word.
#   filters=""        : strip no characters, so punctuation and the angle
#                       brackets of the "<start>"/"<end>" markers added later survive.
tokenizer = Tokenizer(oov_token="<unk>", lower=False, filters="")


In [None]:
import numpy as np
# Wrap every sentence of every split in explicit "<start>" / "<end>" markers.
# NOTE: this rebinds the same names from themselves, so running the cell a
# second time wraps the sentences twice.
train_in, train_val, test_in, test_val = (
    [" ".join(("<start>", sentence, "<end>")) for sentence in split]
    for split in (train_in, train_val, test_in, test_val)
)


In [None]:
# Build the vocabulary from the training sentences only (inputs and targets
# together). The validation sentences are not part of the fit.
tokenizer.fit_on_texts([*train_in, *train_val])

In [None]:
# Inspect the learned word -> index mapping.
# NOTE(review): this displays the entire mapping; consider showing only a slice.
tokenizer.word_index

In [None]:
# Spot-check the second training input after the boundary markers were added.
train_in[1]

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' resource; the stop-word set built below reads it.
nltk.download('stopwords')
# NOTE(review): nothing visible in this notebook uses 'punkt' — confirm it is still needed.
nltk.download('punkt')

In [None]:
# NLTK's English stop-word list, stored as a set; stop() tests membership against it.
stop_words = set(stopwords.words('english'))


In [None]:
def stop(data, drop_words=None):
  """Remove stop words from every sentence in ``data``.

  Each sentence is split on whitespace, every token whose lower-cased form is
  in the stop-word collection is dropped, and the remaining tokens are joined
  back together with single spaces.

  Args:
    data: Iterable of sentence strings.
    drop_words: Optional collection of lower-case words to remove. When
      omitted, the notebook-level ``stop_words`` set is used, which matches
      the original one-argument behaviour.

  Returns:
    A new list of cleaned sentences, in the same order and of the same length
    as ``data``.
  """
  # NOTE(review): for a grammar-correction task the removed words (articles,
  # auxiliaries, prepositions) may be exactly what needs correcting — confirm
  # that stripping them is intended.
  if drop_words is None:
    drop_words = stop_words
  return [
    " ".join(token for token in sentence.split() if token.lower() not in drop_words)
    for sentence in data
  ]

# Strip stop words from all four sentence lists, rebinding each name to its cleaned list.
train_in, train_val, test_in, test_val = map(
    stop, (train_in, train_val, test_in, test_val)
)


In [None]:
# Number of entries in the tokenizer's word index.
# Later cells size the Embedding and Dense layers with `vacab+1`.
vacab = len(tokenizer.word_index)

In [None]:
# Replace every sentence by its list of integer token ids, using the fitted tokenizer.
train_in, train_val, test_in, test_val = map(
    tokenizer.texts_to_sequences, (train_in, train_val, test_in, test_val)
)

In [None]:
from keras.utils import pad_sequences
# Fixed sequence length (in tokens) shared by every split.
max_length = 256

# Pad AND truncate on the right-hand side. pad_sequences truncates from the
# front by default ("pre"), which would cut the leading "<start>" token off any
# sequence longer than max_length while the padding is appended at the end.
# Keeping both on the same side keeps position 0 aligned across all sequences.
train_in = pad_sequences(train_in, padding="post", truncating="post", maxlen=max_length)
train_val = pad_sequences(train_val, padding="post", truncating="post", maxlen=max_length)
test_in = pad_sequences(test_in, padding="post", truncating="post", maxlen=max_length)
test_val = pad_sequences(test_val, padding="post", truncating="post", maxlen=max_length)

In [None]:
# Report the shape of each padded array, one per line.
print(
    f"train_in_shape : {train_in.shape}",
    f"train_val_shape : {train_val.shape}",
    f"test_in_shape : {test_in.shape}",
    f"test_val_shape : {test_val.shape}",
    sep="\n",
)

In [None]:
from tensorflow.keras.layers import Input,Dense,Embedding,LSTM
# NOTE(review): `max_len` is not referenced by any visible cell (padding uses
# `max_length`); kept in case a cell outside this view relies on it.
max_len = 256

# ----- Encoder: token ids -> final LSTM state -----
encoder_input = Input(shape=(None,))
# mask_zero=True marks index 0 (the padding value) as "no data", so the LSTM
# skips the padded time steps. Without it the final state would be computed
# after running over all the trailing zeros of a post-padded sequence.
# vacab+1 because index 0 is reserved for padding and word indices start at 1.
encoder_emd = Embedding(vacab+1, 128, mask_zero=True)(encoder_input)
encoder_lstm = LSTM(256, return_state=True)
_, h, c = encoder_lstm(encoder_emd)
encoder_state = [h, c]

# ----- Decoder: previous target tokens + encoder state -> next-token distribution -----
decoder_input = Input(shape=(None,))
# Keep the layer object separate from its output tensor so that the trained
# layer can be reused when the step-by-step inference model is built.
decoder_emd_layer = Embedding(vacab+1, 128, mask_zero=True)
decoder_emd = decoder_emd_layer(decoder_input)

decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_emd, initial_state=encoder_state)
# One softmax over the vocabulary per decoder time step.
decoder_Dense = Dense(vacab+1, activation="softmax")
decoder_out = decoder_Dense(decoder_output)

In [None]:

# Label array for training: a second name for the padded target sequences
# (same object as `train_val`, no copy is made).
output = train_val

In [None]:
from tensorflow.keras.models import Model
# Training model: (source ids, decoder input ids) -> per-step vocabulary distribution.
model = Model(
    inputs=[encoder_input, decoder_input],
    outputs=decoder_out,
)

In [None]:
# Sparse cross-entropy: the labels are integer token ids, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Teacher forcing with a one-step shift: at position t the decoder is fed
# target token t and must predict target token t+1. Feeding the very same
# array as decoder input and as label (as before) only teaches the network to
# copy its current input token, so nothing useful is learned for generation.
history = model.fit(
    [train_in, train_val[:, :-1]],   # decoder input: every target token except the last
    output[:, 1:],                   # labels: every target token except the first
    epochs=1,
    batch_size=8
)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
import numpy as np

# --- Objects reused from the training cells above ---
# encoder_input, encoder_emd, encoder_lstm  : encoder graph
# decoder_emd_layer, decoder_lstm, decoder_Dense : TRAINED decoder layers
# tokenizer, stop()                          : text preprocessing
#
# The decoder embedding must NOT be re-created here: a fresh Embedding layer
# has randomly initialised weights and would discard what training learned.

# --- Encoder model: source token ids -> final LSTM states ---
encoder_outputs, state_h, state_c = encoder_lstm(encoder_emd)
encoder_model = Model(encoder_input, [state_h, state_c])

# --- Decoder model: one token + previous states -> next-token distribution + new states ---
dec_input_in = Input(shape=(1,))
dec_state_h_in = Input(shape=(256,))
dec_state_c_in = Input(shape=(256,))

dec_emd_out = decoder_emd_layer(dec_input_in)  # trained embedding layer from the training graph
dec_out, state_h, state_c = decoder_lstm(dec_emd_out, initial_state=[dec_state_h_in, dec_state_c_in])
dec_out = decoder_Dense(dec_out)

decoder_model = Model([dec_input_in, dec_state_h_in, dec_state_c_in],
                      [dec_out, state_h, state_c])

# --- Inference loop (greedy decoding) ---
sentence = "he go to school yesterday"
# Preprocess exactly like the training data: boundary markers first, then
# stop-word removal, then conversion to token ids.
prepared = stop(["<start> " + sentence + " <end>"])
enc_seq = np.array(tokenizer.texts_to_sequences(prepared))

# verbose=0 keeps predict() from printing a progress bar on every call.
state_h, state_c = encoder_model.predict(enc_seq, verbose=0)

# Both markers were part of the text the tokenizer was fitted on. Index them
# directly so a missing marker fails loudly instead of silently falling back
# to an unrelated token id.
start_token = tokenizer.word_index["<start>"]
end_token   = tokenizer.word_index["<end>"]
dec_input = np.array([[start_token]])

decoded_tokens = []
max_len_out = 50  # hard cap on generated tokens in case "<end>" is never predicted

for _ in range(max_len_out):
    dec_out, state_h, state_c = decoder_model.predict([dec_input, state_h, state_c], verbose=0)
    next_token = int(np.argmax(dec_out[0, -1, :]))  # most likely token at the last (only) step
    if next_token == end_token:
        break
    decoded_tokens.append(next_token)
    # Feed the prediction back in as the next decoder input.
    dec_input = np.array([[next_token]])

idx2word = {v: k for k, v in tokenizer.word_index.items()}
# Index 0 is the padding id and has no word, so it is skipped.
corrected = " ".join(idx2word.get(t, "") for t in decoded_tokens if t != 0)

print("Input    :", sentence)
print("Corrected:", corrected)
