<a href="https://colab.research.google.com/github/justxoai/NLP-Grammaly/blob/main/GrammarCorrection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

todo: train again and push result to gdrive

# Data extraction
Requires GDrive to has `C4_200M.tsv-00000-of-00010.zip` at `MyDrive/Dataset/`. This will extract and split the dataset into 3 parts: train, val, test with the ratio 14:3:3. After that, the 3 subdataset will be saved to GDrive at `MyDrive/`




## Setup enviroment

In [None]:
# Mount GDrive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# constants
zip_name = 'C4_200M.tsv-00000-of-00010'
zip_path = '/content/drive/MyDrive/Dataset/{}{}'.format(zip_name, '.zip')
extract_dir = '/content'
extract_path = '/content/{}'.format(zip_name)
col_headers = ['Input', 'Target']

nrows = 200 * 1000

## Zip to DataFrame

In [None]:
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print(f"Extracted: {zip_name} to {extract_dir}")

In [None]:
from_tsv = pd.read_csv(extract_path, sep='\t', nrows=nrows)
columns = list(from_tsv.columns)
from_tsv = from_tsv.rename(columns={columns[0]: col_headers[0], columns[1]: col_headers[1]})

### Missing data completion

In [22]:
from_tsv.isna().sum()

NameError: name 'from_tsv' is not defined

In [None]:
def fill_missing(row):
    if pd.isna(row[col_headers[0]]) and pd.isna(row[col_headers[1]]):  # Both are null
        row[col_headers[0]], row[col_headers[1]] = "", ""
    elif pd.isna(row[col_headers[0]]):  # col_headers[0] is null, assign col_headers[1]'s value to col_headers[0]
        row[col_headers[0]] = row[col_headers[1]]
    elif pd.isna(row[col_headers[1]]):  # col_headers[1] is null, assign col_headers[0]'s value to col_headers[1]
        row[col_headers[1]] = row[col_headers[0]]
    return row

# Apply the function row-wise
from_tsv = from_tsv.apply(fill_missing, axis=1)

In [None]:
from_tsv.isna().sum()

## Split and save to drive

In [None]:
train_df, test_df = train_test_split(from_tsv, test_size=0.3)
val_df, test_df = train_test_split(test_df, test_size = 0.5)
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)
train_df.to_csv('/content/drive/MyDrive/dataset_train.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/dataset_val.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/dataset_test.csv', index=False)

# Model training

## Setup environment

In [1]:
# Mount GDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip -q install evaluate rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [27]:
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer)
from tqdm.notebook import tqdm
import evaluate
import numpy as np
import torch
import shutil

In [6]:
# constants
col_headers = ['Input', 'Target']
model_pretrained = 't5-base'
model_trained = 'my_fine_tuned_t5_base_model'
return_tensors = 'pt'

## Setup helper

In [7]:
def encode(
    src_df: pd.DataFrame,
    tokenizer: AutoTokenizer,
    src_max_length: int = 334,
    tgt_max_length: int = 128,
    return_tensors: str = None,
    batch_size: int = 32
):
    """
    Optimized function to encode source and target text from a DataFrame.

    Args:
        src_df (pd.DataFrame): DataFrame containing col_headers[0] and col_headers[1] columns.
        tokenizer (AutoTokenizer): Tokenizer from Hugging Face Transformers.
        src_max_length (int): Maximum length for source text. Default is 334.
        tgt_max_length (int): Maximum length for target text. Default is 128.
        return_tensors (str): If 'pt', returns PyTorch tensors. If 'tf', returns TensorFlow tensors.
        batch_size (int): Number of samples to process in a batch. Default is 32.

    Returns:
        List[Dict]: A list of dictionaries containing tokenized input IDs, attention masks, and labels.
    """
    document_encoded = []

    # Process in batches for efficiency
    for i in tqdm(range(0, len(src_df), batch_size), desc="Encoding documents"):
        batch = src_df.iloc[i:i + batch_size]
        src_texts = batch[col_headers[0]].tolist()
        tgt_texts = batch[col_headers[1]].tolist()

        # Tokenize source texts
        encoded_input = tokenizer(
            src_texts,
            padding=True,
            truncation=True,
            max_length=src_max_length,
            return_tensors=return_tensors
        )

        # Tokenize target texts
        encoded_target = tokenizer(
            tgt_texts,
            padding=True,
            truncation=True,
            max_length=tgt_max_length,
            return_tensors=return_tensors
        )

        # Append encoded data for each document in the batch
        for j in range(len(batch)):
            encoded = {
                'input_ids': encoded_input['input_ids'][j],
                'attention_mask': encoded_input['attention_mask'][j],
                'labels': encoded_target['input_ids'][j]
            }
            document_encoded.append(encoded)

    return document_encoded

In [14]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred: tuple):
    predictions, labels = eval_pred
    pred_decoded = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100,labels,tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    # The use_stemmer parameter enables stemming, which reduces words to their root form before comparison.
    result = rouge.compute(predictions=pred_decoded, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}

## Prepare trainer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_pretrained)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [9]:
train_df = pd.read_csv('/content/drive/MyDrive/dataset_train.csv')
val_df = pd.read_csv('/content/drive/MyDrive/dataset_val.csv')

In [10]:
train_encoded = encode(src_df=train_df, tokenizer=tokenizer, return_tensors=return_tensors)
val_encoded = encode(src_df=val_df, tokenizer=tokenizer, return_tensors=return_tensors)

Encoding documents:   0%|          | 0/4375 [00:00<?, ?it/s]

Encoding documents:   0%|          | 0/938 [00:00<?, ?it/s]

In [11]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_pretrained)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_pretrained,
    padding='longest',
    return_tensors=return_tensors)

In [19]:
training_args = Seq2SeqTrainingArguments(
    output_dir=model_trained,
    evaluation_strategy="epoch",
    eval_steps=500,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    report_to="tensorboard",
    save_strategy="epoch",
    warmup_steps=500,
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
)



In [20]:
torch.cuda.empty_cache()
print(model.device)  # Should print "cuda:0" or similar
print(next(model.parameters()).is_cuda)  # Should print "True"

cpu
False


In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset = train_encoded,
    eval_dataset = val_encoded,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Train

In [21]:
trainer.train()

  trainer = Seq2SeqTrainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.1922,0.178922,0.7263,0.628,0.719,0.7189,17.9612
2,0.1823,0.174968,0.7276,0.6302,0.7203,0.7203,17.9445


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=8750, training_loss=0.3340732740129743, metrics={'train_runtime': 12376.3463, 'train_samples_per_second': 22.624, 'train_steps_per_second': 0.707, 'total_flos': 7.953543821795328e+16, 'train_loss': 0.3340732740129743, 'epoch': 2.0})

In [23]:
print(model.device)  # Should print "cuda:0" or similar
print(next(model.parameters()).is_cuda)  # Should print "True"

cuda:0
True


## Save to GDrive

In [28]:
shutil.copytree('/content/{}'.format(model_trained), '/content/drive/MyDrive/{}'.format(model_trained))

'/content/drive/MyDrive/my_fine_tuned_t5_base_model'

todo: zip

In [None]:
shutil.make_archive('/content/{}'.format(model_trained), 'zip', '/content/{}'.format(model_trained))

In [None]:
shutil.copy('/content/{}'.format(model_trained), '/content/drive/MyDrive/{}'.format(model_trained))

# Model loading
Loads an existing model from GDrive

## Setup environment

In [None]:
# Mount GDrive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import shutil
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer)

In [None]:
# constants
model_trained = 'my_fine_tuned_t5_base_model'

## Get model from GDrive

todo: get zip and extract instead

In [7]:
shutil.copytree('/content/drive/MyDrive/{}'.format(model_trained), '/content/{}'.format(model_trained))

'/content/my_fine_tuned_t5_base_model'

In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained('/content/drive/MyDrive/{}/{}'.format(model_trained, 'checkpoint-8750'))
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/{}/{}'.format(model_trained, 'checkpoint-8750'))

# Model Testing
Requires a newly trained model or an existing model from GDrive. See two methods above.

## Setup environment

In [17]:
import re
import torch

In [6]:
# constants
# truncation
# padding
# max_length
# return_tensors
model_trained = 'my_fine_tuned_t5_base_model'

## Setup helper

In [18]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [29]:
def correct_sentence(sentence):
    tokens = tokenizer(
        sentence,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors="pt"
    ).to(torch_device)

    output_ids = model.generate(
        **tokens,
        max_length=512,
        num_beams=4,
        num_return_sequences=1,
        temperature=1.5)

    output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return output_text[0]

In [30]:
print(correct_sentence('he are an teacher.'))

he is a teacher.


In [31]:
def split_into_sentences(paragraph):
    sentences = re.split(r'(?<=[.!?]) +', paragraph)  # Split on punctuation followed by space
    return sentences

In [32]:
def correct_paragraph(paragraph):
    sentences = split_into_sentences(paragraph)
    corrected_sentences = []
    for sentence in sentences:
        corrected = correct_sentence(sentence)
        corrected_sentences.append(corrected)  # Take the first corrected version
    return " ".join(corrected_sentences)

In [33]:
print(correct_paragraph('hello everyon. my name is At. im shit at NLp. please help moi.'))

hello everyone. my name is At. im shit at NLp. please help me.


## Testing process

# Install Libraries



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip -q install transformers

In [None]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/484.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip -q install evaluate rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
!pip -q install rouge

In [None]:
!pip -q install torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip -q install pytorch-lightning

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/819.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.3/819.3 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/927.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m927.3/927.3 kB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip -q install pytorch-ignite

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/312.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.7/312.7 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q sentencepiece

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
import string

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

from typing import Tuple, List, Dict

import evaluate
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from tqdm.notebook import tqdm
import operator
from ignite.handlers import ModelCheckpoint
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import (AutoModelForSeq2SeqLM,
                          AutoTokenizer,
                          DataCollatorForSeq2Seq,
                          pipeline,
                          Seq2SeqTrainingArguments,
                          Seq2SeqTrainer)

import tensorflow_datasets

seed = 42
device = torch.device('cuda')

os.environ["WANDB_DISABLED"] = "true"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  from torch.distributed.optim import ZeroRedundancyOptimizer


In [None]:
trainer.save_model('correction')

#Testing

## Text thử

In [None]:
print(torch.cuda.is_available())

True


In [None]:
correction = pipeline("text2text-generation", model='correction', device=-1, truncation=True)

Device set to use cpu


In [None]:
incorrect_text = train_df.iloc[0]['Input']
correction(incorrect_text)[0]['generated_text']

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


'Champagne Moment – Magesh’s sounds so cool caught and bowled.'

##Text chính

### TH1: Bình thường

#### Text ngắn

In [13]:
torch_device = 'cuda'

def correct_grammar(input_text,num_return_sequences=1):
  batch = tokenizer([input_text],truncation=True,padding='max_length',max_length=512, return_tensors="pt").to('cpu')
  translated = model.generate(**batch,max_length=512,num_beams=4, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

In [14]:
text = 'They could culture more land and grows food a lot more.'
print(correct_grammar(text, num_return_sequences=3))  # Generate 3 possible corrections
 # (Thường xác xuất tạo các kết quả đúng khác nhau là rất nhỏ và thêm nữa đa số các kết quả sau thường sai chỉ có kết quả đầu đúng nên khuyến khích dùng 1 cái thôi :///)



['They could cultivate more land and grow food a lot more.', 'They could culture more land and grow food a lot more.', 'They could grow more land and grow food a lot more.']


In [15]:
text = 'he are an teachers'
print(correct_grammar(text, num_return_sequences=1))

['he is a teacher.']


In [None]:
text = """These art forms start with sologans to find the talent, but from what I’ve observed, they just entertaiment. """

print(correct_grammar(text, num_return_sequences= 1))

['These art forms start with solo artists to find the talent, but from what I’ve observed, they just entertaiment.']


#### Text bị dài




In [None]:
torch_device = 'cuda'

def correct_grammar(input_text, num_return_sequences):
    # Hàm chia văn bản thành các đoạn nhỏ
    def split_text(text, max_length):
        words = text.split()
        chunks = []
        current_chunk = []

        for word in words:
            # Kiểm tra nếu từ tiếp theo có thể thêm vào đoạn hiện tại
            if len(" ".join(current_chunk + [word])) <= max_length:
                current_chunk.append(word)
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
        if current_chunk:
            chunks.append(" ".join(current_chunk))  # Thêm đoạn còn lại
        return chunks

    # Chia văn bản thành các đoạn nhỏ không vượt quá max_length
    chunks = split_text(input_text, max_length=512)

    # Xử lý từng đoạn nhỏ và gộp kết quả lại
    all_translated = []

    for chunk in chunks:
        batch = tokenizer([chunk], truncation=True, padding='max_length', max_length=512, return_tensors="pt").to(torch_device)
        translated = model.generate(**batch, max_length=512, num_beams=4, num_return_sequences=num_return_sequences, temperature=1.5)
        tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        all_translated.append(tgt_text[0])  # Giả sử mỗi batch chỉ trả về 1 câu dịch

    # Nối tất cả các đoạn lại thành văn bản hoàn chỉnh
    return " ".join(all_translated)


text =  """Today gift shows are popular in many countries, and purpose of these shows finds talented people, and help them to introduce themselves to each other .Actually, many people now watch this shows, and during this years find more fans that cause increase the Viewer, and many sponsors Keen on for sponsoring this shows, because gift shows has benefits for them, and this programs convert to tools that earn money, and present their services.

Firstly, result this programme has  a massive effect on the society, because many people get a chance to represent their gift. On the other hand, many people have gift, but they do not know, so they have the opportunity to find their gift, and encourage them to follow their interests.

secondly, many audiences, and viewers watch this shows, so it is a big chance for companies by sponsoring in this program. They can find new customers and introduce their services to each other.For instance, they commercials between the shows certify this issue.Furthermore TV is one of the tools that entertain people, although the target finds gift, so part of this shows for entertaining people.

As a result, the aim of  producing this shows impressive, so part of the society following this shows for entertaining, and the part of the people persuade to find their talents. In fact, this topic has two side that everyone can according to own opinion.

"""
print(correct_grammar(text, num_return_sequences=1))

Today gift shows are popular in many countries, and purpose of these shows finds talented people, and helps them to introduce themselves to each other.Actually, many people now watch this shows, and during this years find more fans that cause increase the Viewer, and many sponsors Keen on for sponsoring this shows, because gift shows has benefits for them, and these programs convert to tools that earn money, and present their services. Firstly, result this programme has a massive effect on the society. because many people get a chance to represent their gift. On the other hand, many people have gift, but they do not know, so they have the opportunity to find their gift, and encourage them to follow their interests. secondly, many audiences, and viewers watch this shows, so it is a big chance for companies to sponsor in this program. They can find new customers and introduce their services to each other.For instance, the commercials between the shows certify this issue. tools that enter

### TH2: Khác thường (Lỗi dấu câu và viết tắt)

In [None]:
!pip install contractions # Thư viện giúp cho fix chữ viết tắt đơn giản

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [None]:
import contractions

# Viết hoa chữ đầu câu
def capitalize_first_letter(text):
    sentences = re.split(r'([.!?])', text)
    capitalized_sentences = []
    for sentence in sentences:
        sentence = sentence.strip() # Loại bỏ khoảng trắng dư thừa
        if sentence:
            capitalized_sentences.append(sentence[0].upper() + sentence[1:] if sentence else '')
        else:
            capitalized_sentences.append('')

    # Ghép các câu lại thành một đoạn văn hoàn chỉnh
    return ' '.join(capitalized_sentences).replace(' ,', ',').replace(',', ', ').replace(' .', '.').replace(' ?', '?').replace(' !', '!')


# Hàm chia văn bản thành các đoạn nhỏ
def split_text(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        if len(" ".join(current_chunk + [word])) <= max_length:
            current_chunk.append(word)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]

    if current_chunk:
        chunks.append(" ".join(current_chunk))  # Thêm đoạn còn lại

    return chunks


# Correct Grammar
def correct_grammar(input_text, num_return_sequences):
    # Chuẩn hóa dấu câu - Loại bỏ dấu cùng loại dư thừa
    normalized_text = re.sub(r',\s*,+', ',', input_text)
    normalized_text = re.sub(r'\.\s*\.+', '.', normalized_text)
    normalized_text = re.sub(r'\!\s*\!+', '.', normalized_text)
    normalized_text = re.sub(r'\?\s*\?+', '.', normalized_text)

    # Viết tắt - Mở rộng các từ viết tắt
    expanded_text = contractions.fix(normalized_text)

    # Chia văn bản thành các đoạn nhỏ không vượt quá max_length
    chunks = split_text(expanded_text, max_length=512)
    corrected_texts = []

    for chunk in chunks:
        # Tokenize và encode
        batch = tokenizer([chunk], truncation=True, padding='max_length', max_length=512, return_tensors="pt").to(torch_device)

        # Generate corrected text
        translated = model.generate(
            **batch,
            max_length=512,
            num_beams=4,
            num_return_sequences=num_return_sequences,
            temperature=1.5
        )
        tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

        for text in tgt_text:
            text = text.lower()
            text = capitalize_first_letter(text)
            corrected_texts.append(text)

    # Ghép các đoạn lại thành văn bản hoàn chỉnh
    return " ".join(corrected_texts)

In [None]:
# Example usage
input_text = """I can't believe it's already December,time flies so fast! I haven't seen him since last year, he probably won't come to the party.Btw,Do you think she is going to make it? I don't know, but she's been really busy lately, so maybe she won't. Also, I heard that they're planning a surprise for us, but I don't know if it'll be a good idea... What do you think about that? I think it's gonna be great, though! I just hope everyone can come."""
num_return_sequences = 1

corrected_text = correct_grammar(input_text, num_return_sequences=num_return_sequences)

# Display results
print(f"Corrected Text: {corrected_text}")

Corrected Text: I cannot believe it is already december, time flies so fast! I have not seen him since last year,  he probably will not come to the party. By the way, do you think she is going to make it? I do not know,  but she has been really busy lately,  so maybe she will not. Also,  i heard that they are planning a surprise for us,  but i do not know if it will be a good idea. What do you think about that? I think it is going to be great,  though! I just hope everyone can come. 
