In [1]:
# Colab line magic: select the TensorFlow 1.x runtime (the notebook relies on tf.contrib and tf.placeholder).
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
# Mount Google Drive; later cells read the checkpoint and the Leipzig pickles from /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from copy import deepcopy
from os import listdir
from os.path import isfile, join
from collections import namedtuple
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
import time
import re
from sklearn.model_selection import train_test_split

In [4]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-8EW1wlmn9n9ikbTNowqKoiVIhEJoeJP' -O spelling_errs.pickle

--2020-04-06 10:38:52--  https://docs.google.com/uc?export=download&id=1-8EW1wlmn9n9ikbTNowqKoiVIhEJoeJP
Resolving docs.google.com (docs.google.com)... 172.217.212.102, 172.217.212.100, 172.217.212.113, ...
Connecting to docs.google.com (docs.google.com)|172.217.212.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0g-54-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/a4jauibags48dm9g10m3tk6e9rd0bacs/1586169525000/07481003931234110333/*/1-8EW1wlmn9n9ikbTNowqKoiVIhEJoeJP?e=download [following]
--2020-04-06 10:38:53--  https://doc-0g-54-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/a4jauibags48dm9g10m3tk6e9rd0bacs/1586169525000/07481003931234110333/*/1-8EW1wlmn9n9ikbTNowqKoiVIhEJoeJP?e=download
Resolving doc-0g-54-docs.googleusercontent.com (doc-0g-54-docs.googleusercontent.com)... 209.85.146.132, 2607:f8b0:4001:c1f::84
Connecting to doc-0g-54-docs.googleusercontent.com (doc-0g-

In [0]:
# Load the downloaded error/correction table.
# NOTE(review): read_pickle can execute code embedded in the file; only load a trusted download.
spelling_errs = pd.read_pickle("spelling_errs.pickle")

In [0]:
# Shuffle the rows once with a fixed seed, then carve off the leading fraction as a
# held-out test set; the remainder stays in `spelling_errs` for the cells below.
# NOTE: re-running this cell splits the already reduced frame again.
test_frac = 0.1

spelling_errs = spelling_errs.sample(frac=1, random_state=42).reset_index(drop=True)
n_test = int(len(spelling_errs) * test_frac)
test_df, spelling_errs = spelling_errs[:n_test], spelling_errs[n_test:]

test_df.to_pickle("test.pickle")

In [0]:
# Lower-cased misspellings ("Ошибка") and their corrections ("Исправление"), in row order.
errors = [misspelt.lower() for misspelt in spelling_errs["Ошибка"]]
corrections = [fixed.lower() for fixed in spelling_errs["Исправление"]]

In [8]:
# Show the first ten (misspelling, correction) pairs, tab-separated.
for misspelt, fixed in zip(errors[:10], corrections[:10]):
    print(misspelt, fixed, sep="\t")

штейн	стайн
не   многие	немногие
енергии	энергии
беспокоит	беспокоить
развывающиеся	развивающиеся
не однородный	неоднородный
левой	левые
епигона	эпигона
элизавет	элизабет
науке против старения	науке против старения.


## Preparing the Data

In [0]:
def clean_text(text):
    '''Return the text converted to lower case.

    No characters or spaces are removed; lower-casing is the only normalisation applied.
    '''
    lowered = text.lower()
    return lowered

In [0]:
# Build [correction, misspelling] pairs: element 0 is the clean text, element 1 the noisy one.
clean_books = [
    [clean_text(fixed), clean_text(misspelt)]
    for misspelt, fixed in zip(errors, corrections)
]

In [0]:
# Map every character seen in either side of a pair to an integer id,
# numbering characters in order of first appearance.
vocab_to_int = {}
for clean_side, noisy_side in clean_books:
    for symbol in clean_side + noisy_side:
        # setdefault only assigns when the symbol is new; the id is the current dict size
        vocab_to_int.setdefault(symbol, len(vocab_to_int))

# The special tokens take the next free ids after all characters
codes = ['<PAD>','<EOS>','<GO>']
for token in codes:
    vocab_to_int[token] = len(vocab_to_int)

# Running total of assigned ids (equals the vocabulary size)
count = len(vocab_to_int)

In [12]:
# Check the size of the vocabulary (characters plus the three special tokens) and list every symbol
vocab_size = len(vocab_to_int)
print("The vocabulary contains {} characters.".format(vocab_size))
print(sorted(vocab_to_int))

The vocabulary contains 98 characters.
[' ', '!', '"', '#', '%', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '<EOS>', '<GO>', '<PAD>', '>', '?', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '«', '»', 'è', 'ë', '́', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё', 'қ', '–', '—', '’', '№']


*Note: We could have made this project a little easier by using only lower case words and fewer special characters ($,&,-...), but I want to make this spell checker as useful as possible.*

In [0]:
# Inverse lookup: integer id -> symbol
int_to_vocab = {index: symbol for symbol, index in vocab_to_int.items()}

In [14]:
# Work on an independent deep copy of the [correction, misspelling] pairs; no splitting is performed here.
sentences = deepcopy(clean_books)
print("There are {} sentences.".format(len(sentences)))

There are 8470 sentences.


In [0]:
# Encode element 0 of every pair (the correction) as a list of vocabulary ids
int_sentences = [
    [vocab_to_int[symbol] for symbol in pair[0]]
    for pair in sentences
]

In [16]:
# Collect the length of every encoded sentence into a one-column frame
lengths = pd.DataFrame([len(ids) for ids in int_sentences], columns=["counts"])

# Keep only sentences between 2 ids and (75th percentile of the lengths + 1) ids long
max_length = int(lengths.describe().loc["75%"])+1
min_length = 2

good_sentences = [ids for ids in int_sentences if min_length <= len(ids) <= max_length]

print("We will use {} to train and test our model.".format(len(good_sentences)))

We will use 7383 to train and test our model.


*Note: I decided not to use very long or very short sentences because they are not as useful for training our model. Shorter sentences are less likely to include an error and the text is more likely to be repetitive. Longer sentences are more difficult to learn due to their length and increase the training time quite a bit. If you are interested in using this model for more than just a personal project, it would be worth using these longer sentences and much more training data to create a more accurate model.*

In [17]:
# Split the data into training and testing sentences (15% held out, fixed seed so the split is repeatable)
training, testing = train_test_split(good_sentences, test_size = 0.15, random_state = 42)

print("Number of training sentences:", len(training))
print("Number of testing sentences:", len(testing))

Number of training sentences: 6275
Number of testing sentences: 1108


In [0]:
# Order the sentences by length to reduce padding inside a batch.
# sorted() is stable, so sentences of equal length keep their relative order; the range
# filter keeps the result restricted to lengths in [min_length, max_length].
training_sorted = sorted(
    (ids for ids in training if min_length <= len(ids) <= max_length),
    key=len,
)
testing_sorted = sorted(
    (ids for ids in testing if min_length <= len(ids) <= max_length),
    key=len,
)

In [19]:
# Check to ensure the sentences have been selected and sorted correctly:
# print the five shortest training sentences next to their lengths
for i in range(5):
    print(training_sorted[i], len(training_sorted[i]))

[4, 8] 2
[10, 18] 2
[6, 41] 2
[4, 2] 2
[18, 2] 2


In [0]:
def decoding_seq(int_sequence):
    """Turn a sequence of vocabulary ids back into text using the global int_to_vocab."""
    symbols = []
    for index in int_sequence:
        symbols.append(int_to_vocab[index])
    return "".join(symbols)

In [0]:
# Clean texts and their misspellings, index-aligned with clean_books
inps = [p[0] for p in clean_books]
noise = [p[1] for p in clean_books]

# Group the misspellings by clean text. Looking a clean text up with list.index() always
# returns its first occurrence, so a correction that appears several times would be paired
# with the same misspelling every time (and each lookup scans the whole list).
noisy_variants = {}
for clean_side, noisy_side in clean_books:
    noisy_variants.setdefault(clean_side, []).append(noisy_side)

# Position of the next unused misspelling for each clean text (shared by train and test,
# so every pair from clean_books is handed out at most once)
next_variant = dict.fromkeys(noisy_variants, 0)


def _noisy_ints(int_seq):
    """Return the ids of the next unused misspelling for the clean sentence `int_seq`."""
    clean_side = decoding_seq(int_seq)
    position = next_variant[clean_side]
    next_variant[clean_side] = position + 1
    return [vocab_to_int[c] for c in noisy_variants[clean_side][position]]


training_noisy = [_noisy_ints(seq) for seq in training_sorted]

testing_noisy = [_noisy_ints(seq) for seq in testing_sorted]

## Building the Model

In [0]:
def model_inputs():
    '''Create placeholders for inputs to the model.

    Returns:
        Tuple ``(inputs, targets, keep_prob, inputs_length, targets_length,
        max_target_length)``. ``inputs`` and ``targets`` are int32 placeholders
        with two unspecified dimensions, ``keep_prob`` is a float32 placeholder,
        the two length placeholders are int32 vectors, and ``max_target_length``
        is the maximum over ``targets_length``.
    '''
    
    with tf.name_scope('inputs'):
        inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    with tf.name_scope('targets'):
        targets = tf.placeholder(tf.int32, [None, None], name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    inputs_length = tf.placeholder(tf.int32, (None,), name='inputs_length')
    targets_length = tf.placeholder(tf.int32, (None,), name='targets_length')
    # Derived tensor, not a placeholder: it follows whatever is fed into targets_length
    max_target_length = tf.reduce_max(targets_length, name='max_target_len')

    return inputs, targets, keep_prob, inputs_length, targets_length, max_target_length

In [0]:
def process_encoding_input(targets, vocab_to_int, batch_size):
    '''Remove the last id from each row of the batch and prepend <GO> to each row.

    Args:
        targets: 2-D tensor of target ids (first dimension is sliced up to batch_size).
        vocab_to_int: mapping that provides the id of the '<GO>' token.
        batch_size: number of rows; also the height of the <GO> column that is prepended.

    Returns:
        The decoder input tensor: a <GO> column followed by targets without their last column.
    '''
    
    with tf.name_scope("process_encoding"):
        # Slice rows [0, batch_size) and every column except the last one
        ending = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
        # Prepend a [batch_size, 1] column filled with the <GO> id
        dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

In [0]:
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob, direction):
    '''Create the encoding layer.

    Args:
        rnn_size: number of units of every LSTM cell.
        sequence_length: per-row input lengths passed to the dynamic RNN.
        num_layers: how many 'encoder_{i}' variable scopes are built.
        rnn_inputs: embedded input tensor fed to the RNN.
        keep_prob: input keep probability for the dropout wrappers.
        direction: 1 for a unidirectional encoder, 2 for a bidirectional one.

    Returns:
        ``(enc_output, enc_state)``. For ``direction == 2`` the forward and backward
        outputs are concatenated along axis 2 and only the forward state is returned.

    Raises:
        ValueError: if ``direction`` is neither 1 nor 2 (previously the function
            silently returned None, which only failed later when the caller unpacked it).

    NOTE(review): every loop iteration feeds the same ``rnn_inputs`` and overwrites
    ``enc_output``/``enc_state``, so only the last layer's result is returned and the
    layers are not stacked. This is left untouched because changing the graph would
    presumably invalidate checkpoints saved with it — confirm before changing.
    '''
    
    if direction not in (1, 2):
        raise ValueError("direction must be 1 (unidirectional) or 2 (bidirectional), got {!r}".format(direction))

    if direction == 1:
        with tf.name_scope("RNN_Encoder_Cell_1D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    lstm = tf.contrib.rnn.LSTMCell(rnn_size)

                    drop = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                         input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.dynamic_rnn(drop, 
                                                              rnn_inputs,
                                                              sequence_length,
                                                              dtype=tf.float32)

            return enc_output, enc_state
        
        
    if direction == 2:
        with tf.name_scope("RNN_Encoder_Cell_2D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    cell_fw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                            input_keep_prob = keep_prob)

                    cell_bw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                            input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                            cell_bw, 
                                                                            rnn_inputs,
                                                                            sequence_length,
                                                                            dtype=tf.float32)
            # Join outputs since we are using a bidirectional RNN
            enc_output = tf.concat(enc_output,2)
            # Use only the forward state because the model can't use both states at once
            return enc_output, enc_state[0]

In [0]:
def training_decoding_layer(dec_embed_input, targets_length, dec_cell, initial_state, output_layer, 
                            vocab_size, max_target_length):
    '''Create the training logits.

    Args:
        dec_embed_input: embedded decoder inputs (batch-major, see time_major=False).
        targets_length: per-row target lengths for the training helper.
        dec_cell: decoder RNN cell.
        initial_state: initial decoder state.
        output_layer: projection layer applied by the decoder.
        vocab_size: not used inside this function.
        max_target_length: upper bound on the number of decoding steps.

    Returns:
        The first element of the dynamic_decode result (the decoder output).
    '''
    
    with tf.name_scope("Training_Decoder"):
        # The training helper feeds the provided decoder inputs at every step
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                            sequence_length=targets_length,
                                                            time_major=False)

        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                           training_helper,
                                                           initial_state,
                                                           output_layer) 

        training_logits, _ ,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                    output_time_major=False,
                                                                    impute_finished=True,
                                                                    maximum_iterations=max_target_length)
        return training_logits

In [0]:
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_target_length, batch_size):
    '''Create the inference logits.

    Args:
        embeddings: embedding matrix used to embed the previously generated id.
        start_token: id fed as the first decoder input for every row.
        end_token: id that marks the end of a generated sequence.
        dec_cell: decoder RNN cell.
        initial_state: initial decoder state.
        output_layer: projection layer applied by the decoder.
        max_target_length: upper bound on the number of decoding steps.
        batch_size: number of rows; one start token is created per row.

    Returns:
        The first element of the dynamic_decode result (the decoder output).
    '''
    
    with tf.name_scope("Inference_Decoder"):
        # One start token per row of the batch
        start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')

        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                    start_tokens,
                                                                    end_token)

        inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                            inference_helper,
                                                            initial_state,
                                                            output_layer)

        inference_logits, _ ,_ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                                    output_time_major=False,
                                                                    impute_finished=True,
                                                                    maximum_iterations=max_target_length)

        return inference_logits

In [0]:
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, inputs_length, targets_length, 
                   max_target_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers, direction):
    '''Create the decoding cell and attention for the training and inference decoding layers.

    Args:
        dec_embed_input: embedded decoder inputs for training.
        embeddings: decoder embedding matrix, used at inference time.
        enc_output: encoder outputs attended over by the attention mechanism.
        enc_state: encoder state used as the decoder's initial cell state.
        vocab_size: output dimension of the projection layer.
        inputs_length: per-row input lengths (memory lengths for attention).
        targets_length: per-row target lengths.
        max_target_length: upper bound on decoding steps.
        rnn_size: LSTM size, also used as attention depth and attention layer size.
        vocab_to_int: mapping that provides the '<GO>' and '<EOS>' ids.
        keep_prob: input keep probability for the dropout wrapper.
        batch_size: number of rows per batch.
        num_layers: number of loop iterations that create a decoder cell (see note below).
        direction: not used inside this function.

    Returns:
        ``(training_logits, inference_logits)`` as returned by the two decoding helpers.
    '''
    
    with tf.name_scope("RNN_Decoder_Cell"):
        # NOTE(review): dec_cell is overwritten on every iteration, so only the cell
        # created in the final iteration is passed on; the cells are not stacked.
        for layer in range(num_layers):
            with tf.variable_scope('decoder_{}'.format(layer)):
                lstm = tf.contrib.rnn.LSTMCell(rnn_size)
                dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                         input_keep_prob = keep_prob)
    
    # Projection from the decoder output to vocabulary-sized scores
    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                  enc_output,
                                                  inputs_length,
                                                  normalize=False,
                                                  name='BahdanauAttention')
    
    with tf.name_scope("Attention_Wrapper"):
        dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,
                                                      attn_mech,
                                                      rnn_size)
    
    # Start from the wrapper's zero state, but with the encoder state as the cell state
    initial_state = dec_cell.zero_state(batch_size=batch_size,dtype=tf.float32).clone(cell_state=enc_state)

    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, 
                                                  targets_length, 
                                                  dec_cell, 
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_target_length)
    # Same scope with reuse=True so inference shares the variables created for training
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,  
                                                    vocab_to_int['<GO>'], 
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell, 
                                                    initial_state, 
                                                    output_layer,
                                                    max_target_length,
                                                    batch_size)

    return training_logits, inference_logits

In [0]:
def seq2seq_model(inputs, targets, keep_prob, inputs_length, targets_length, max_target_length, 
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size, embedding_size, direction):
    '''Use the previous functions to create the training and inference logits.

    Builds separate encoder and decoder embedding matrices of shape
    [vocab_size, embedding_size], runs the encoder on the embedded inputs, prepares
    the decoder inputs from the targets and wires everything into decoding_layer.

    Returns:
        ``(training_logits, inference_logits)`` from decoding_layer.
    '''
    
    # NOTE(review): both embedding variables are created without explicit names, so their
    # graph names depend on creation order — keep the statement order stable.
    enc_embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
    enc_embed_input = tf.nn.embedding_lookup(enc_embeddings, inputs)
    enc_output, enc_state = encoding_layer(rnn_size, inputs_length, num_layers, 
                                           enc_embed_input, keep_prob, direction)
    
    dec_embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
    # Decoder input = <GO> + targets without their last column
    dec_input = process_encoding_input(targets, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    training_logits, inference_logits  = decoding_layer(dec_embed_input, 
                                                        dec_embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        inputs_length, 
                                                        targets_length, 
                                                        max_target_length,
                                                        rnn_size, 
                                                        vocab_to_int, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers,
                                                        direction)
    
    return training_logits, inference_logits

In [0]:
def pad_sentence_batch(sentence_batch):
    """Right-pad every sentence with the <PAD> id up to the longest sentence in the batch.

    Returns a new list of new lists; the sentences passed in are left untouched.
    """
    target_len = max(map(len, sentence_batch))
    pad_id = vocab_to_int['<PAD>']
    padded = []
    for sentence in sentence_batch:
        filler = [pad_id] * (target_len - len(sentence))
        padded.append(sentence + filler)
    return padded

In [0]:
# The default parameters
epochs = 100              # not referenced by the cells shown in this notebook
batch_size = 128          # rows per batch; the graph is built for exactly this many rows
num_layers = 2            # number of encoder/decoder layer scopes that are created
rnn_size = 512            # units per LSTM cell (also used as the attention size)
embedding_size = 128      # width of the character embeddings
learning_rate = 0.0005    # Adam learning rate
direction = 2             # 1 = unidirectional encoder, 2 = bidirectional encoder
threshold = 0.95          # not referenced by the cells shown in this notebook
keep_probability = 0.75   # passed to build_graph, where the keep_prob placeholder replaces it

In [0]:
def build_graph(keep_prob, rnn_size, num_layers, batch_size, learning_rate, embedding_size, direction):
    """Reset the default graph and build the full seq2seq model, loss and training op.

    Args:
        keep_prob: immediately overwritten by the keep_prob placeholder from
            model_inputs(), so the value passed in has no effect.
        rnn_size, num_layers, batch_size, embedding_size, direction: forwarded to seq2seq_model.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A ``Graph`` namedtuple with the fields listed in ``export_nodes``.
    """

    tf.reset_default_graph()
    
    # Load the model inputs    
    inputs, targets, keep_prob, inputs_length, targets_length, max_target_length = model_inputs()

    # Create the training and inference logits
    # The inputs are reversed along their last axis before being fed to the model, and the
    # vocabulary size handed to the model is one larger than len(vocab_to_int).
    training_logits, inference_logits = seq2seq_model(tf.reverse(inputs, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      inputs_length,
                                                      targets_length,
                                                      max_target_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      vocab_to_int,
                                                      batch_size,
                                                      embedding_size,
                                                      direction)

    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')

    with tf.name_scope('predictions'):
        # sample_id holds the ids chosen by the inference decoder
        predictions = tf.identity(inference_logits.sample_id, name='predictions')
        tf.summary.histogram('predictions', predictions)

    # Create the weights for sequence_loss
    masks = tf.sequence_mask(targets_length, max_target_length, dtype=tf.float32, name='masks')
    
    with tf.name_scope("cost"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, 
                                                targets, 
                                                masks)
        tf.summary.scalar('cost', cost)

    with tf.name_scope("optimze"):
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        # Each gradient is clipped element-wise to [-5, 5]; variables without a gradient are skipped
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

    # Merge all of the summaries
    merged = tf.summary.merge_all()    

    # Export the nodes 
    # The namedtuple is filled by looking the names up in locals(), so every entry of
    # export_nodes must match a local variable name of this function exactly.
    export_nodes = ['inputs', 'targets', 'keep_prob', 'cost', 'inputs_length', 'targets_length',
                    'predictions', 'merged', 'train_op','optimizer']
    Graph = namedtuple('Graph', export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])

    return graph

## Fixing Custom Sentences

In [0]:
def text_to_ints(text, stoi):
    '''Encode the lower-cased text as vocabulary ids.

    Characters that are missing from `stoi` are replaced by the id of '<PAD>'.
    '''

    def _encode(symbol):
        try:
            return stoi[symbol]
        except KeyError:
            return stoi['<PAD>']

    return [_encode(symbol) for symbol in text.lower()]

In [0]:
def get_batches(sentences, batch_size):
    """Yield padded, <EOS>-terminated batches of sentences together with their lengths.

    Only full batches are produced: a trailing remainder of fewer than `batch_size`
    sentences is not yielded. The input sentences are never modified; each batch is
    built from fresh lists (the previous implementation appended <EOS> to the caller's
    lists in place, so every additional pass added another <EOS>).

    Args:
        sentences: list of sentences, each a list of vocabulary ids.
        batch_size: number of sentences per batch.

    Yields:
        ``(pad_sentences_batch, pad_sentences_lengths)`` where the first item is a numpy
        array of the padded batch and the second a list with the length of every padded
        row (all entries are therefore equal to the padded width).
    """
    
    for batch_i in range(0, len(sentences)//batch_size):
        start_i = batch_i * batch_size
        sentences_batch = sentences[start_i:start_i + batch_size]
            
        # Copy each sentence before adding <EOS> so the caller's lists stay untouched
        sentences_batch_eos = [list(sentence) + [vocab_to_int['<EOS>']] for sentence in sentences_batch]
            
        pad_sentences_batch = np.array(pad_sentence_batch(sentences_batch_eos))
        
        # Need the lengths for the _lengths parameters
        pad_sentences_lengths = [len(sentence) for sentence in pad_sentences_batch]
        
        yield pad_sentences_batch, pad_sentences_lengths

In [34]:
%%time

# Load the Leipzig token table and keep only the words whose "error" flag is set.
# NOTE(review): read_pickle can execute code embedded in the file; the Drive file is assumed trusted.
leipzig_df = pd.read_pickle("/content/gdrive/My Drive/RLC2/leipzig_df.pickle")
selected_words = list(leipzig_df.loc[leipzig_df["error"]]["word"])
# Drop the frame right away; only the selected words are used by the following cells
del leipzig_df

CPU times: user 16.6 s, sys: 4.37 s, total: 21 s
Wall time: 21.2 s


In [35]:
# Number of words whose "error" flag is set
len(selected_words)

7875005

In [36]:
%%time

# Encode every selected word as vocabulary ids (characters outside the vocabulary become <PAD>)
inference_sents = [text_to_ints(word, vocab_to_int) for word in selected_words]

CPU times: user 23 s, sys: 238 ms, total: 23.2 s
Wall time: 23.2 s


In [0]:
# The raw words are no longer needed once they are encoded; release the list
del selected_words

In [0]:
# Path of the saved model that is restored in the inference cell
checkpoint = "/content/gdrive/My Drive/RLC/best_obfuscator2/best_obfuscator.ckpt"

# Rebuild the graph with the default parameters defined above
model = build_graph(keep_probability, rnn_size, num_layers, batch_size, learning_rate, embedding_size, direction) 

In [42]:
%%time

# Run inference over every full batch and collect the predicted id rows
results = []

with tf.Session() as sess:
    # Load saved model
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint)
    
    # get_batches only yields full batches, so a trailing remainder of fewer than
    # batch_size words produces no prediction
    for input_batch, input_length in get_batches(inference_sents, batch_size):
        # targets_length is fed only to bound the number of decoding steps (its maximum
        # becomes maximum_iterations); no targets are fed because only predictions are fetched
        answer_logits = sess.run(model.predictions, {model.inputs: input_batch, 
                                                    model.inputs_length: input_length,
                                                    model.targets_length: [max(input_length)+1], 
                                                    model.keep_prob: [1.0]})
        # Despite the name, these are the predicted token ids (model.predictions), not logits
        results += list(answer_logits)

INFO:tensorflow:Restoring parameters from /content/gdrive/My Drive/RLC/best_obfuscator2/best_obfuscator.ckpt
CPU times: user 2h 2min 9s, sys: 21min 3s, total: 2h 23min 12s
Wall time: 1h 9min 53s


In [43]:
%%time

#del inference_sents

results_decoded = ["".join([int_to_vocab[int(i)] for i in answer]) for answer in results]
results_decoded = [res[:res.find("<EOS>")] for res in results_decoded]

CPU times: user 45.6 s, sys: 0 ns, total: 45.6 s
Wall time: 45.6 s


In [44]:
# First ten decoded model outputs
results_decoded[:10]

['архив',
 'ряда',
 'контролью',
 'средства',
 'по',
 'секторое',
 'улицы',
 'деятельностия',
 'км',
 'через']

In [45]:
# Ten more decoded model outputs (positions 110-119)
results_decoded[110:120]

['наша',
 'отмечает',
 'государственный',
 'или',
 'принятия',
 'декларации',
 'являлься',
 'мет',
 'предуссмотрен',
 'походы']

In [0]:
import pickle

# Persist the decoded outputs locally, then copy the file to Drive so it survives the Colab session
with open("res2.pickle", "wb") as outp:
  pickle.dump(results_decoded, outp)

!cp res2.pickle /content/gdrive/My\ Drive/RLC2

In [0]:
import pickle

# Reload the decoded outputs from Drive (lets the notebook resume here without re-running inference).
# NOTE(review): pickle.load can execute code embedded in the file; only load files you wrote yourself.
with open("/content/gdrive/My Drive/RLC2/res2.pickle", "rb") as inp:
  results_decoded = pickle.load(inp)

In [0]:
#del results

# Load the Leipzig table that receives the new "errmodel2" column below
leipzig_df = pd.read_pickle("/content/gdrive/My Drive/RLC2/leipzig_df_we.pickle")

In [49]:
# Attach the generated outputs to the rows whose "error" flag is set, in row order.
# NOTE(review): assumes results_decoded follows the same order as the flagged rows — confirm.
rd = iter(results_decoded)
errs = list(leipzig_df["error"])

# There can be fewer results than flagged rows (batching skips an incomplete final batch).
# next(rd, None) hands out every available result and leaves later flagged rows as None;
# the previous counter-based cut-off compared with >= and so discarded the last result.
errors1 = [next(rd, None) if e else None for e in errs]
leipzig_df["errmodel2"] = errors1

leipzig_df.sample(20)

Unnamed: 0,sent_id,word,whitespaces_follow,error,nones,errmodel1,errmodel2
38604266,wikipedia_227347,лет,1,False,False,,
23210230,webpublic_327690,",",1,False,False,,
43439554,wikipedia_492668,без,1,False,True,,
29114858,webpublic_680648,1943,0,False,False,,
13302018,newscrawl_750951,Проголосовать,1,False,False,,
7403736,newscrawl_414570,рыбе,1,False,False,,
37289882,wikipedia_157187,Опыт,1,False,True,,
6073253,newscrawl_339833,состоянию,1,False,False,,
12449306,newscrawl_703016,планирует,1,False,False,,
38934447,wikipedia_245424,до,1,False,False,,


In [50]:
# Another random sample of rows to inspect the new column
leipzig_df.sample(20)

Unnamed: 0,sent_id,word,whitespaces_follow,error,nones,errmodel1,errmodel2
36961811,wikipedia_139018,созданию,1,False,False,,
3328729,newscrawl_185230,%,1,False,False,,
13787417,newscrawl_779590,справляется,0,False,False,,
2194404,newscrawl_123113,проведения,1,True,False,проведия,проведения
10700746,newscrawl_606281,два,1,False,False,,
3108519,newscrawl_173616,В,1,False,False,,
14171995,newscrawl_800922,заместителем,1,True,False,заметилем,заместителем
47747262,wikipedia_733234,нас,1,False,False,,
32343153,webpublic_870315,примерно,1,True,False,примено,примерно
35999244,wikipedia_85760,В,1,False,False,,


In [0]:
# Save the augmented table locally, then copy it to Drive (same file name as the one loaded above)
leipzig_df.to_pickle("leipzig_df_we.pickle")
!cp leipzig_df_we.pickle /content/gdrive/My\ Drive/RLC2

Examples of corrected sentences:
- Spellin is difficult, whch is wyh you need to study everyday.
- Spelling is difficult, which is why you need to study everyday.


- The first days of her existence in th country were vrey hard for Dolly. 
- The first days of her existence in the country were very hard for Dolly.


- Thi is really something impressiv thaat we should look into right away! 
- This is really something impressive that we should look into right away!

## Summary

I hope that you have found this project to be interesting and useful. The example sentences that I have presented above were specifically chosen, and the model will not always be able to make corrections of this quality. Given the amount of data that we are working with, this model still struggles. For it to be more useful, it would require far more training data and additional parameter tuning. The parameter values that I have above worked best for me, but I expect there are even better values that I was not able to find.

Thanks for reading!