### CHAPTER 1

Load dataset

In [23]:
from datasets import load_dataset

dataset = load_dataset("imdb")

train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
test_dataset = dataset["test"].shuffle(seed=42).select(range(500))

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


#### Using BPE

In [24]:
from transformers import AutoTokenizer

bpe_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_bpe(batch):
  return bpe_tokenizer(batch['text'], padding='max_length', truncation=True)

train_bpe = train_dataset.map(tokenize_bpe, batched=True, remove_columns=["text"])
test_bpe = test_dataset.map(tokenize_bpe, batched=True, remove_columns=["text"])

train_bpe.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_bpe.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


[A
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 2515.93 examples/s]

[A
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 3650.41 examples/s]


In [29]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Train the BERT model with the BPE tokenization
model_bpe = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

def compute_metrics(pred):
	labels = pred.label_ids
	preds = pred.predictions.argmax(-1)
	precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
	acc = accuracy_score(labels, preds)
	return {
		'accuracy': acc,
		'f1': f1,
		'precision': precision,
		'recall': recall
	}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
training_args = TrainingArguments(
	output_dir='./results_bpe',
	num_train_epochs=3,
	per_device_train_batch_size=8,
	per_device_eval_batch_size=8,
	evaluation_strategy='epoch',
	logging_dir='./logs_bpe',
  learning_rate=2e-5,
)

trainer_bpe = Trainer(
	model=model_bpe,
	args=training_args,
	train_dataset=train_bpe,
	eval_dataset=test_bpe,
  compute_metrics=compute_metrics
)

trainer_bpe.train()

  0%|          | 1/375 [27:17<170:10:04, 1637.98s/it]
                                                 
 33%|‚ñà‚ñà‚ñà‚ñé      | 125/375 [05:06<09:24,  2.26s/it]

{'eval_loss': 0.293795645236969, 'eval_accuracy': 0.896, 'eval_f1': 0.8893617021276595, 'eval_precision': 0.9330357142857143, 'eval_recall': 0.8495934959349594, 'eval_runtime': 44.2889, 'eval_samples_per_second': 11.289, 'eval_steps_per_second': 1.422, 'epoch': 1.0}


                                                   
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 250/375 [10:54<05:19,  2.55s/it]

{'eval_loss': 0.3471946716308594, 'eval_accuracy': 0.884, 'eval_f1': 0.8905660377358491, 'eval_precision': 0.8309859154929577, 'eval_recall': 0.959349593495935, 'eval_runtime': 41.1385, 'eval_samples_per_second': 12.154, 'eval_steps_per_second': 1.531, 'epoch': 2.0}


                                                 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 375/375 [16:11<00:00,  2.59s/it]

{'eval_loss': 0.3431745767593384, 'eval_accuracy': 0.906, 'eval_f1': 0.906187624750499, 'eval_precision': 0.8901960784313725, 'eval_recall': 0.9227642276422764, 'eval_runtime': 40.0338, 'eval_samples_per_second': 12.489, 'eval_steps_per_second': 1.574, 'epoch': 3.0}
{'train_runtime': 971.1565, 'train_samples_per_second': 3.089, 'train_steps_per_second': 0.386, 'train_loss': 0.31103653971354167, 'epoch': 3.0}





TrainOutput(global_step=375, training_loss=0.31103653971354167, metrics={'train_runtime': 971.1565, 'train_samples_per_second': 3.089, 'train_steps_per_second': 0.386, 'total_flos': 789333166080000.0, 'train_loss': 0.31103653971354167, 'epoch': 3.0})

In [32]:
# Evaluate BERT model
eval_results_bpe = trainer_bpe.evaluate()
print(f"BERT with BPE Evaluation Results: {eval_results_bpe}")
print(f"Accuracy: {eval_results_bpe['eval_accuracy']:.4f}")
print(f"Precision: {eval_results_bpe['eval_precision']:.4f}")
print(f"Recall: {eval_results_bpe['eval_recall']:.4f}")
print(f"F1 Score: {eval_results_bpe['eval_f1']:.4f}")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:39<00:00,  1.59it/s]

BERT with BPE Evaluation Results: {'eval_loss': 0.3431745767593384, 'eval_accuracy': 0.906, 'eval_f1': 0.906187624750499, 'eval_precision': 0.8901960784313725, 'eval_recall': 0.9227642276422764, 'eval_runtime': 40.5268, 'eval_samples_per_second': 12.338, 'eval_steps_per_second': 1.555, 'epoch': 3.0}
Accuracy: 0.9060
Precision: 0.8902
Recall: 0.9228
F1 Score: 0.9062





#### LSTM not using BPE

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
import torch

def tokenizer(texts, vectorizer=None):
	if vectorizer is None:
		vectorizer = CountVectorizer(max_features=1000, token_pattern=r'\b\w+\b')
		vectors = vectorizer.fit_transform(texts).toarray()
	else:
		vectors = vectorizer.transform(texts).toarray()
	return torch.tensor(vectors, dtype=torch.long), vectorizer

train_texts = train_dataset['text']
train_labels = torch.tensor(train_dataset['label'], dtype=torch.long)

test_texts = test_dataset['text']
test_labels = torch.tensor(test_dataset['label'], dtype=torch.long)

# Convert texts to token ids
train_simple, vectorizer = tokenizer(train_texts)
test_simple, _ = tokenizer(test_texts, vectorizer)

# Add a dimension for the channel
train_simple = train_simple.unsqueeze(1)
test_simple = test_simple.unsqueeze(1)

In [34]:
import torch.nn as nn
import torch.optim as optim

class SimpleLSTM(nn.Module):
	def __init__(self, input_dim, hidden_dim, output_dim):
		super(SimpleLSTM, self).__init__()
		self.hidden_dim = hidden_dim
		self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
		self.fc = nn.Linear(hidden_dim, output_dim)

	def forward(self, x):
		lstm_out, _ = self.lstm(x)
		lstm_out = lstm_out[:, -1, :] # get the last output
		out = self.fc(lstm_out)
		return out

input_dim = train_simple.shape[2]
hidden_dim = 64
output_dim = 2
model_simple = SimpleLSTM(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_simple.parameters(), lr=0.001)

In [35]:
from torch.utils.data import DataLoader, TensorDataset

# Dataloader
train_dataset_simple = TensorDataset(train_simple.float(), train_labels)
train_loader = DataLoader(train_dataset_simple, batch_size=8, shuffle=True)

test_dataset_simple = TensorDataset(test_simple.float(), test_labels)
test_loader = DataLoader(test_dataset_simple, batch_size=8, shuffle=False)

In [36]:
# Training loop for LSTM
num_epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_simple.to(device)

for epoch in range(num_epochs):
	model_simple.train()
	for inputs, labels in train_loader:
		inputs, labels = inputs.to(device), labels.to(device)
		optimizer.zero_grad()
		outputs = model_simple(inputs.float())
		loss = criterion(outputs, labels)
		loss.backward()
		optimizer.step()

	# Evaluation
	model_simple.eval()
	correct = 0
	total = 0
	with torch.no_grad():
		for inputs, labels in test_loader:
			inputs, labels = inputs.to(device), labels.to(device)
			outputs = model_simple(inputs.float())
			_, predicted = torch.max(outputs.data, 1)
			total += labels.size(0)
			correct += (predicted == labels).sum().item()

	accuracy = 100 * correct / total
	print(f'Epoch {epoch+1}, Accuracy: {accuracy:.2f}%')

Epoch 1, Accuracy: 72.20%
Epoch 2, Accuracy: 79.60%
Epoch 3, Accuracy: 80.60%


In [37]:
def evaluate_lstm(model, test_loader):
	model.eval()
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	model.to(device)

	all_preds = []
	all_labels = []
	with torch.no_grad():
		for inputs, labels in test_loader:
			inputs, labels = inputs.to(device), labels.to(device)
			outputs = model(inputs.float())
			_, predicted = torch.max(outputs.data, 1)
			all_preds.extend(predicted.cpu().numpy())
			all_labels.extend(labels.cpu().numpy())

	accuracy = accuracy_score(all_labels, all_preds)
	precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
	return {
		'accuracy': accuracy,
		'precision': precision,
		'recall': recall,
		'f1': f1
	}

eval_results_lstm = evaluate_lstm(model_simple, test_loader)
print(f"Accuracy: {eval_results_lstm['accuracy']:.4f}")
print(f"Precision: {eval_results_lstm['precision']:.4f}")
print(f"Recall: {eval_results_lstm['recall']:.4f}")
print(f"F1 Score: {eval_results_lstm['f1']:.4f}")

Accuracy: 0.8060
Precision: 0.8112
Recall: 0.8060
F1 Score: 0.8054


### CHAPTER 2

#### Generate training data


In [38]:
# Read text file
file_path_1 = 'Final_Project/giai-tri/url_007.txt'
file_path_2 = 'Final_Project/giai-tri/url_006.txt'
file_path_3 = 'Final_Project/giai-tri/url_002.txt'
file_path_4 = 'Final_Project/giai-tri/url_001.txt'
file_path_5 = 'Final_Project/giai-tri/url_352.txt'
file_path_6 = 'Final_Project/giai-tri/url_361.txt'

def read_text_file(file_path):
	with open(file_path, 'r', encoding='utf-8') as file:
		text = file.read()
	return text

original_data = read_text_file(file_path_1) + '. ' + read_text_file(file_path_2) + '. ' + read_text_file(file_path_3) + '. ' + read_text_file(file_path_4) + '. ' + read_text_file(file_path_5) + '. ' + read_text_file(file_path_6)

#Remove all newlines
original_data = original_data.replace('\n', '')
print(original_data)

Ki·ªát t√°c h∆°n 140 tri·ªáu USD c·ªßa T·ªÅ B·∫°ch Th·∫°ch "Th·∫≠p nh·ªã phong c·∫£nh ƒë·ªì" c·ªßa T·ªÅ B·∫°ch Th·∫°ch t·ª´ng b√°n gi√° 140,8 tri·ªáu USD - ƒë·∫Øt nh·∫•t Trung Qu·ªëc.Thevalue c√¥ng b·ªë danh s√°ch "10 t√°c ph·∫©m ngh·ªá thu·∫≠t ƒë·∫Øt nh·∫•t ƒë∆∞·ª£c b√°n ƒë·∫•u gi√°" h·ªìi cu·ªëi th√°ng 11, sau khi c·∫≠p nh·∫≠t nh·ªØng t√°c ph·∫©m ƒëo·∫°t gi√° cao trong nƒÉm, Th·∫≠p nh·ªã phong c·∫£nh ƒë·ªì c·ªßa T·ªÅ B·∫°ch Th·∫°ch ƒë·ª©ng th·ª© t√°m v·ªõi m·ª©c 931,5 tri·ªáu nh√¢n d√¢n t·ªá (kho·∫£ng 140,8 tri·ªáu USD).Con s·ªë n√†y ƒë∆∞·ª£c ·∫•n ƒë·ªãnh trong phi√™n ƒë·∫•u c·ªßa Poly B·∫Øc Kinh h·ªìi th√°ng 12/2017. T√°c ph·∫©m c√≥ m·ª©c gi√° kh·ªüi ƒëi·ªÉm l√† 450 tri·ªáu NDT, sau h∆°n 20 ph√∫t v·ªõi h∆°n 60 l∆∞·ª£t ƒë·∫∑t gi√°, t√°c ph·∫©m ƒë∆∞·ª£c ch·ªët ·ªü m·ª©c 931,5 tri·ªáu NDT bao g·ªìm thu·∫ø ph√≠. Ng∆∞·ªùi mua l√† nh√† s∆∞u t·∫≠p Trung Qu·ªëc. Tranh l·∫≠p k·ª∑ l·ª•c t√°c ph·∫©m ngh·ªá thu·∫≠t Trung Qu·ªëc ƒë·∫Øt gi√° nh·∫•t. T·ªÅ B·∫°ch Th·∫°ch tr·ªü th√†nh danh h·ªça Tru

In [40]:
import random

vietnamese_characters = "aƒÉ√¢e√™io√¥∆°u∆∞y√°√†·∫£√£·∫°·∫•·∫ß·∫©·∫´·∫≠·∫Ø·∫±·∫≥·∫µ·∫∑√©√®·∫ª·∫Ω·∫π·∫ø·ªÅ·ªÉ·ªÖ·ªá√≠√¨·ªâƒ©·ªã√≥√≤·ªè√µ·ªç·ªë·ªì·ªï·ªó·ªô·ªõ·ªù·ªü·ª°·ª£√∫√π·ªß≈©·ª•·ª©·ª´·ª≠·ªØ·ª±√Ω·ª≥·ª∑·ªπ·ªµ"
keyboard_mistakes = {
	'a': '√¢ƒÉ', 'ƒÉ': 'a', '√¢': 'a',
	'e': '√™', '√™': 'e',
	'i': '√≠√¨',
	'o': '√¥∆°', '√¥': 'o', '∆°': 'o',
	'u': '∆∞', '∆∞': 'u',
	'd': 'ƒë', 'ƒë': 'd'
}

def random_typo(word):
	# Choosen error type
	error_type = random.choice(['accent', 'keyboard', 'homophone'])

	if error_type == 'accent':
		for i in range(len(word)):
			if word[i] in vietnamese_characters:
				new_char = random.choice(vietnamese_characters)
				word = word[:i] + new_char + word[i+1:]
				break

	elif error_type == 'keyboard':
		for i in range(len(word)):
			if word[i] in keyboard_mistakes:
				new_char = random.choice(keyboard_mistakes[word[i]])
				word = word[:i] + new_char + word[i+1:]
				break

	elif error_type == 'homophone':
		homophones = {'d': 'gi', 'gi': 'd', 'tr': 'ch', 'ch': 'tr', 's': 'x', 'x': 's'}
		for key in homophones:
			if key in word:
				word = word.replace(key, homophones[key], 1)
				break

	return word

In [43]:
# Generate mistakes in text
def generate_mistakes(text, error_rate=0.1):
	words = text.split()
	new_words = []
	for word in words:
		if random.random() < error_rate:
			word = random_typo(word)
		new_words.append(word)
	return ' '.join(new_words)

error_data = generate_mistakes(original_data, error_rate=0.2) # 20% error rate
print(error_data)

Ki·ªát t√°c h∆°n 140 tr·ªπ·ªáu USD c·ªßa T·ª∑ B·∫°ch Th·∫°ch "Th·∫≠p nh·ªá phong c·∫£nh ƒë·ªì" c·ªßa T·ªÅ B·∫°ch Th·∫°tr t·ª´ng b√°n gi√° 140,8 tri·ªáu USD - ƒë·∫Øt nh·∫•t Trung Q·ª≠·ªëc.Thevalue c√¥ng b·ªë danh s√°ch "10 t√°c ph·∫©m ngh·ªá thu·∫≠t ƒë·∫Øt nh·∫•t ƒë∆∞·ª£c b√°n ƒë·∫•u d√°" h·ªìi cu·ªëi th√°ng 11, sau kh√≠ c·∫≠p nh·∫≠t nh·ªØng t·ª±c ph·∫©m ƒëo·∫°t g·ª©√° cao trong nam, Th·∫≠p nh·ªã phong c·∫£nh ƒë·ªì c·ªßa T·ªÅ B·∫°ch Th·∫°ch ƒë·ª©ng th·ª© t√°m v·ªõi m·ª©c 931,5 tri·ªáu nh√¢n d√¢n t·ªá (kho·∫£ng 140,8 tri·ªáu USD).Con s·ªë n√†y ƒë∆∞·ª£c ·∫•n d·ªãnh trong phi√™n d·∫•u c·ªßa Poly B·∫Øc Kinh h·ªìi th√°ng 12/2017. T√°c ph·∫©m c√≥ m·ª©c gi√° kh·ªüi ƒëi·ªÉm l√† 450 tr·∫£·ªáu NDT, sau h∆°n 20 ph√∫t v·ªõ√≠ h∆°n 60 l∆∞·ª£t ƒë·∫∑t g·ª±√°, t√°c ph·∫©m ƒë∆∞·ª£c ch·ªët ·ªü m·ª©c 931,5 tr√¨·ªáu NDT bao g·ªìm th√Ω·∫ø ph√≠. Ng∆∞·ªùi m∆∞a l√† nh√† s∆∞u t·∫≠p Tr∆∞ng Qu·ªëc. Tranh l·∫≠p k·ª∑ l·ª•c t√°c ph·∫©m ngh·ªá thu·∫≠t Trung Qu·ªëc ƒë·∫Øt gi√° nh·∫•t. T·ªÅ B·∫°ch Th·∫°ch tr·ªü th√†nh g

In [44]:
import re
import pandas as pd

def extract_phrases(text):
	return re.findall(r'\w[\w ]+', text)

original_phrases = extract_phrases(original_data)
error_phrases = extract_phrases(error_data)

def prepare_data_for_t5(original_phrases, error_phrases):
	data = {"source": [], "target": []}
	for original, error in zip(original_phrases, error_phrases):
		data["source"].append(error)
		data["target"].append(original)
	df = pd.DataFrame(data)
	return df

data = prepare_data_for_t5(original_phrases, error_phrases)
data.head()

Unnamed: 0,source,target
0,Ki·ªát t√°c h∆°n 140 tr·ªπ·ªáu USD c·ªßa T·ª∑ B·∫°ch Th·∫°ch,Ki·ªát t√°c h∆°n 140 tri·ªáu USD c·ªßa T·ªÅ B·∫°ch Th·∫°ch
1,Th·∫≠p nh·ªá phong c·∫£nh ƒë·ªì,Th·∫≠p nh·ªã phong c·∫£nh ƒë·ªì
2,c·ªßa T·ªÅ B·∫°ch Th·∫°tr t·ª´ng b√°n gi√° 140,c·ªßa T·ªÅ B·∫°ch Th·∫°ch t·ª´ng b√°n gi√° 140
3,8 tri·ªáu USD,8 tri·ªáu USD
4,ƒë·∫Øt nh·∫•t Trung Q·ª≠·ªëc,ƒë·∫Øt nh·∫•t Trung Qu·ªëc


In [45]:
from pyvi import ViTokenizer

# Tokenize Vietnamese text
def tokenize(text):
	return ViTokenizer.tokenize(text)

data['source'] = data['source'].apply(tokenize)
data['target'] = data['target'].apply(tokenize)
data.head()

Unnamed: 0,source,target
0,Ki·ªát_t√°c h∆°n 140 tr·ªπ·ªáu USD c·ªßa T·ª∑ B·∫°ch_Th·∫°ch,Ki·ªát_t√°c h∆°n 140 tri·ªáu USD c·ªßa T·ªÅ B·∫°ch_Th·∫°ch
1,Th·∫≠p_nh·ªá phong_c·∫£nh_ƒë·ªì,Th·∫≠p_nh·ªã phong_c·∫£nh_ƒë·ªì
2,c·ªßa T·ªÅ B·∫°ch_Th·∫°tr t·ª´ng b√°n gi√° 140,c·ªßa T·ªÅ B·∫°ch_Th·∫°ch t·ª´ng b√°n gi√° 140
3,8 tri·ªáu USD,8 tri·ªáu USD
4,ƒë·∫Øt nh·∫•t Trung_Q·ª≠·ªëc,ƒë·∫Øt nh·∫•t Trung_Qu·ªëc


In [46]:
from datasets import Dataset

dataset = Dataset.from_pandas(data).train_test_split(test_size=0.2)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 488
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 122
    })
})


#### Model Training

In [53]:
from transformers import T5TokenizerFast

tokenizer = T5TokenizerFast.from_pretrained('t5-base')
prefix = "Correction: "

def tokenize_function(examples):
	inputs = [prefix + doc for doc in examples['source']]
	targets = examples['target']
	model_inputs = tokenizer(inputs, max_length=256, padding="max_length", truncation=True)
	labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
	model_inputs['labels'] = labels['input_ids']
	return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 488/488 [00:00<00:00, 6284.45 examples/s]

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 122/122 [00:00<00:00, 8653.02 examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 488
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 122
    })
})





In [54]:
import torch

if torch.backends.mps.is_available():
	device = torch.device("mps")
else:
	device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


In [55]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.to(device)

training_args = TrainingArguments(
	output_dir='./results',
	evaluation_strategy='epoch',
	learning_rate=5e-5,
	per_device_train_batch_size=4,
	per_device_eval_batch_size=4,
	num_train_epochs=3,
	weight_decay=0.01,
	push_to_hub=False,
)

trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test']
)

trainer.train()

  4%|‚ñç         | 7/183 [08:03<3:22:46, 69.13s/it]
 33%|‚ñà‚ñà‚ñà‚ñé      | 122/366 [04:41<13:15,  3.26s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

 33%|‚ñà‚ñà‚ñà‚ñé      | 122/366 [04:53<13:15,  3.26s/it]
[A
[A

{'eval_loss': 0.0630718395113945, 'eval_runtime': 12.8528, 'eval_samples_per_second': 9.492, 'eval_steps_per_second': 2.412, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 244/366 [09:49<03:58,  1.96s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 244/366 [10:04<03:58,  1.96s/it]
[A
[A

{'eval_loss': 0.05569525808095932, 'eval_runtime': 14.3159, 'eval_samples_per_second': 8.522, 'eval_steps_per_second': 2.165, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 366/366 [14:06<00:00,  1.99s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 366/366 [14:21<00:00,  1.99s/it]
[A
[A
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 366/366 [14:21<00:00,  2.35s/it]

{'eval_loss': 0.054083675146102905, 'eval_runtime': 14.6148, 'eval_samples_per_second': 8.348, 'eval_steps_per_second': 2.121, 'epoch': 3.0}
{'train_runtime': 861.6268, 'train_samples_per_second': 1.699, 'train_steps_per_second': 0.425, 'train_loss': 0.45499682817302767, 'epoch': 3.0}





TrainOutput(global_step=366, training_loss=0.45499682817302767, metrics={'train_runtime': 861.6268, 'train_samples_per_second': 1.699, 'train_steps_per_second': 0.425, 'total_flos': 445757175889920.0, 'train_loss': 0.45499682817302767, 'epoch': 3.0})

#### Inference Model

In [56]:
def correct_spelling(input_text, model, tokenizer):
	# input_text_segmented = ViTokenizer.tokenize(input_text)
	input_with_prefix = "Correction: " + input_text
	inputs = tokenizer(input_with_prefix, return_tensors="pt", max_length=128, truncation=True, padding=True)
	inputs = {k: v.to(device) for k, v in inputs.items()}
	outputs = model.generate(inputs["input_ids"], max_length=256)
	corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
	return corrected_text.replace('_', ' ')  # Remove underscores after tokenization

input_text = "vƒÉn b·∫£n b·ªã l·ªïi ch√≠nh t√£"
corrected_text = correct_spelling(input_text, model, tokenizer)
print(corrected_text)

VƒÉn bn b li chnh t


#### Evaluate model

In [57]:
from datasets import load_metric

# Load BLEU metric
bleu = load_metric('bleu')

def evaluate_model(model, tokenizer, dataset):
	model.eval()
	predictions = []
	references = []

	for example in dataset:
		inputs = tokenizer(example['source'], return_tensors='pt', max_length=128, truncation=True, padding="max_length")
		inputs = {k: v.to(device) for k, v in inputs.items()}
		output = model.generate(inputs['input_ids'], max_length=256, num_beams=4, early_stopping=True)
		decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
		predictions.append(decoded_output.split())
		references.append([example['target'].split()])

	return bleu.compute(predictions=predictions, references=references)

# Evaluate the model on test dataset
bleu_score = evaluate_model(model, tokenizer, dataset['test'])
print(f"BLEU score: {bleu_score['bleu']}")

  bleu = load_metric('bleu')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


BLEU score: 0.03091260249784252
