In [None]:
!pip install "tensorflow-text==2.8.*"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import itertools
import tqdm
import pickle
import random
import torch
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import tensorflow_text as tf_text
import unicodedata

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset and pickle
# paths used below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**data**

In [None]:
def check_valid_sentence():
    """Build the regex that recognises an acceptable sentence.

    A sentence is accepted when it is non-empty and made up exclusively of
    lowercase Vietnamese letters with diacritics, lowercase ASCII letters,
    digits, ASCII punctuation and the space character. Uppercase letters are
    rejected.

    Returns:
        A compiled pattern anchored with ``^...$``; call ``.match(sentence)``
        and test the result against ``None``.
    """
    intab_l = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
    ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
    digits = '0123456789'
    punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
    whitespace = ' '

    accept_strings = intab_l + ascii_lowercase + digits + punctuation + whitespace
    # Escape before embedding in a character class. Unescaped, the "\]" inside
    # the punctuation is parsed as an escaped "]" (so a literal backslash is
    # never accepted) and ",-." is parsed as a range that only happens to
    # contain the hyphen.
    return re.compile('^[' + re.escape(accept_strings) + ']+$')

In [None]:
data_path = "/content/drive/MyDrive/NLP_Task/CorrectSpellingTask/train_tieng_viet.txt"

# Read the whole corpus into memory, one list entry per line (newline kept).
# The encoding is given explicitly: the file holds Vietnamese text, and the
# default encoding of open() depends on the machine's locale.
with open(data_path, encoding="utf-8") as f:
    train = f.readlines()

**Remove diacritics from the dataset**

In [None]:
# Vietnamese letters that carry diacritics, lowercase then uppercase. Within
# each string the letters are grouped by base letter: a, o, e, u, i, y, d.
intab_l = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
intab_u = "ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"
intab = [*intab_l, *intab_u]

# Replacement letter for every position of intab_l / intab_u: each base letter
# is repeated once per accented variant in its group.
_base_letter_runs = (("a", 17), ("o", 17), ("e", 11), ("u", 11), ("i", 5), ("y", 5), ("d", 1))
outtab_l = "".join(letter * run for letter, run in _base_letter_runs)
outtab_u = outtab_l.upper()
outtab = [*outtab_l, *outtab_u]

# Lookup table accented letter -> base letter, and a regex alternation that
# matches any single accented letter.
replaces_dict = {accented: plain for accented, plain in zip(intab, outtab)}
r = re.compile("|".join(intab))

In [None]:
# Remove diacritics (accent / tone marks) from a sentence
def remove_sign(sen):
    """Return `sen` with every character matched by `r` swapped for its `replaces_dict` entry.

    `r` and `replaces_dict` are the module-level regex and lookup table that
    map accented Vietnamese letters to their unaccented base letters; all
    other characters are left untouched.
    """
    def _to_base_letter(match):
        return replaces_dict[match.group(0)]

    return r.sub(_to_base_letter, sen)

# Characters that are padded with spaces so that they become separate tokens.
# The square brackets belong to the set as well: the pattern previously built
# from this string matched "[" and "]" too, and that behavior is kept.
_SPLIT_MARKS = '[.!?,-${}()]'
# re.escape makes every mark a literal inside the character class; this avoids
# the "Possible nested set" FutureWarning that a class starting with "[[" raises.
_SPLIT_MARKS_RE = re.compile("([" + re.escape(_SPLIT_MARKS) + "])")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


# Separate punctuation (comma, period, ...) from the characters around it
def normalizeString(s):
    """Put spaces around punctuation marks and collapse whitespace.

    Args:
        s: input sentence.

    Returns:
        The sentence with each of ``[ ] . ! ? , - $ { } ( )`` surrounded by
        single spaces, every run of whitespace reduced to one space, and
        leading/trailing whitespace removed.
    """
    # Detach each mark from its neighbours.
    s = _SPLIT_MARKS_RE.sub(r" \1 ", s)
    # Replace runs of whitespace with a single space.
    return _WHITESPACE_RUN_RE.sub(" ", s).strip()

**Split Dataset**

In [None]:
train_idx_180k = []
train_opt_180k = []
train_ipt_180k = []
val_idx_10k = []
val_opt_10k = []
val_ipt_10k = []
test_idx_10k = []
test_opt_10k = []
test_ipt_10k = []

# Split boundaries, expressed as line indices into `train`:
# [0, TRAIN_END) -> train, [TRAIN_END, VAL_END) -> val, [VAL_END, TEST_END) -> test.
TRAIN_END = 180000
VAL_END = 190000
TEST_END = 200000

# Slicing (instead of range(TEST_END)) keeps the loop from raising IndexError
# when the file holds fewer than TEST_END lines.
for i, line in enumerate(train[:TEST_END]):
    # Each record is "<id>\t<sentence>"; partition never raises, unlike
    # unpacking the result of split('\t').
    idx, tab, sen = line.partition('\t')
    if not tab:
        # No tab separator: not a valid record, skip it.
        continue
    # Drop only the line terminator; sen[:-1] would eat a real character on a
    # final line that has no trailing newline.
    sen = sen.rstrip('\n')
    try:
        # Model input: the sentence without diacritics.
        non_sign_sen = normalizeString(remove_sign(sen))
    except Exception:
        # Best effort: skip sentences that cannot be processed, but let
        # KeyboardInterrupt / SystemExit propagate.
        continue
    # Model target: the original sentence with diacritics.
    sen = normalizeString(sen)
    if i < TRAIN_END:
        train_idx_180k.append(idx)
        train_opt_180k.append(sen)
        train_ipt_180k.append(non_sign_sen)
    elif i < VAL_END:
        val_idx_10k.append(idx)
        val_opt_10k.append(sen)
        val_ipt_10k.append(non_sign_sen)
    else:
        test_idx_10k.append(idx)
        test_opt_10k.append(sen)
        test_ipt_10k.append(non_sign_sen)

In [None]:
def _save_pickle(filename, obj):
  with open(filename, 'wb') as f:
    pickle.dump(obj, f)

# Persist the record ids of each split (written to the current working directory).
for _pkl_name, _split_ids in (
    ('train_tv_idx_180k.pkl', train_idx_180k),
    ('val_tv_idx_10k.pkl', val_idx_10k),
    ('test_tv_idx_10k.pkl', test_idx_10k),
):
    _save_pickle(_pkl_name, _split_ids)

**Normalized Data**

In [None]:
def _ngram(text, length = 4):
    words = text.split()
    grams = []
    if len(words) <= length:
      words = words + ["PAD"]*(length-len(words))
      return [' '.join(words)]
    else:
      for i in range(len(words)-length+1):
        grams.append(' '.join(words[i:(i+length)]))
      return grams

def _sentences_to_grams(sentences):
    """Return the n-grams of every sentence as one flat list, in sentence order."""
    return list(itertools.chain.from_iterable(_ngram(item) for item in sentences))


# *_grams hold the target n-grams (with diacritics), *_rev_acc_grams the input
# n-grams (diacritics removed).
train_grams = _sentences_to_grams(train_opt_180k)
train_rev_acc_grams = _sentences_to_grams(train_ipt_180k)

val_grams = _sentences_to_grams(val_opt_10k)
val_rev_acc_grams = _sentences_to_grams(val_ipt_10k)

test_grams = _sentences_to_grams(test_opt_10k)
test_rev_acc_grams = _sentences_to_grams(test_ipt_10k)

# zip() silently truncates to the shorter list, which would pair inputs with
# the wrong targets; fail loudly if the two sides ever differ in length.
assert len(train_rev_acc_grams) == len(train_grams), "train: input/target n-gram counts differ"
assert len(val_rev_acc_grams) == len(val_grams), "val: input/target n-gram counts differ"
assert len(test_rev_acc_grams) == len(test_grams), "test: input/target n-gram counts differ"

# Each corpus is a list of (input n-gram, target n-gram) pairs.
corpus_train = list(zip(train_rev_acc_grams, train_grams))
corpus_val = list(zip(val_rev_acc_grams, val_grams))
corpus_test = list(zip(test_rev_acc_grams, test_grams))

**Create vocabulary**

In [None]:
def _load_pickle(filename):
    pickle_in = open(filename,"rb")
    dict_ = pickle.load(pickle_in)

    return dict_

def _save_pickle(filename, obj):
  with open(filename, 'wb') as f:
    pickle.dump(obj, f)

In [None]:
data_save = "/content/drive/MyDrive/NLP_Task/CorrectSpellingTask/"

# Store the three (input, target) n-gram corpora on Drive.
for _pkl_name, _corpus in (
    ('corpus_train.pkl', corpus_train),
    ('corpus_val.pkl', corpus_val),
    ('corpus_test.pkl', corpus_test),
):
    _save_pickle(data_save + _pkl_name, _corpus)

In [None]:
data_save = "/content/drive/MyDrive/NLP_Task/CorrectSpellingTask/"

# Reload the pickled corpora from Drive (train, val, test — in that order).
corpus_train, corpus_val, corpus_test = (
    _load_pickle(data_save + pkl_name)
    for pkl_name in ('corpus_train.pkl', 'corpus_val.pkl', 'corpus_test.pkl')
)

In [None]:
# Work on the first 100000 (input, target) pairs of the training corpus only.
_train_subset = corpus_train[:100000]
inp = [source_gram for source_gram, _ in _train_subset]
tar = [target_gram for _, target_gram in _train_subset]

In [None]:
BATCH_SIZE = 64
# NOTE(review): this is the size of the full corpus, while the dataset below is
# built from `inp` / `tar` only — confirm whether len(inp) was intended.
BUFFER_SIZE = len(corpus_train)

# Pair inputs with targets, shuffle, then group into batches of BATCH_SIZE.
dataset = (
    tf.data.Dataset.from_tensor_slices((inp, tar))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [None]:
# Special-token constants, in order: padding for short sentences,
# start-of-sentence, end-of-sentence.
PAD_token, SOS_token, EOS_token = 0, 1, 2

def preprocess(text):
    """Normalize a string tensor and wrap it in start/end markers.

    Steps: NFKD-normalize, lowercase, pass through `unicodeToAscii`, then join
    the pieces as "[SOS] <text> [EOS]" separated by single spaces.

    Args:
        text: string tensor accepted by `tf_text.normalize_utf8`.

    Returns:
        The processed string tensor.
    """
    text = tf_text.normalize_utf8(text, 'NFKD')
    # encoding='utf-8' makes lower() Unicode-aware. The default treats the
    # input as ASCII, so a letter such as "Đ" (which NFKD does not decompose)
    # would stay uppercase.
    text = tf.strings.lower(text, encoding='utf-8')
    # NOTE(review): unicodeToAscii is not defined in the visible part of this
    # notebook; it must be defined before this function is called and has to
    # accept a tensor — confirm.
    text = unicodeToAscii(text)
    return tf.strings.join(['[SOS]', text, '[EOS]'], separator=' ')