<a href="https://colab.research.google.com/github/justxoai/NLP-Grammaly/blob/main/NLP_GrammarAutocorrector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries

In [None]:
# transformers library
!pip install -q transformers
# evaluate rouge_score
!pip install -q evaluate rouge_score
# rouge library
!pip install -q rouge
# torch library
!pip install -q torch

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

<h1> Import library </h1>

In [None]:
import os
import random
import re

import nltk
nltk.download('punkt')

import evaluate
import pandas as pd
import numpy as np
import torch

from tqdm.notebook import tqdm

from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer)

import shutil

import zipfile

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


<h1> Connect Google Colab with Google Drive </h1>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data extraction

<h1> Select Seed and Computing Device </h1>

In [None]:
# Fixed seed, applied to every random number generator the notebook relies on
# so that shuffles and splits are repeatable across runs.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# pointing at a CUDA device that does not exist.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Turn off Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

<h1> Extracting Dataset files from Google Drive </h1>

In [None]:
# Where the zipped dataset shards live on Google Drive
drive_dataset_folder = "/content/drive/MyDrive/NLP/Dataset"

# Local Colab directory that receives the extracted files
content_dataset_folder = "/content/NLP_Dataset"
os.makedirs(content_dataset_folder, exist_ok=True)  # no error if it already exists

# Collect every entry whose name ends in ".zip", then unpack each archive
zip_files = list(
    filter(lambda name: name.endswith(".zip"), os.listdir(drive_dataset_folder))
)
for zip_file in zip_files:
    extract_path = content_dataset_folder  # all archives share one destination
    zip_path = os.path.join(drive_dataset_folder, zip_file)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Extracted: {zip_file} to {extract_path}")


Extracted: C4_200M.tsv-00001-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00002-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00003-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00005-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00008-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00004-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00000-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00009-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00007-of-00010.zip to /content/NLP_Dataset
Extracted: C4_200M.tsv-00006-of-00010.zip to /content/NLP_Dataset


<h1> Merge Dataframe </h1>

In [None]:
# Number of rows to read from each dataset
num_rows = 10000

# Collect every file below `/content/NLP_Dataset` whose name contains ".tsv"
# (a substring test, not an extension test). The list is sorted because
# os.walk yields files in arbitrary order, and the merge order should be the
# same on every run.
data_files = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk("/content/NLP_Dataset")
    for file in files
    if ".tsv" in file.lower()
)

print("Found TSV files:", data_files)  # Check detected .tsv files

# Read data from .tsv files
dfs = []  # List to store dataframes
col_headers = ["Input", "Target"]  # Column headers

# Import data from each .tsv file
for raw_df_file in tqdm(
    data_files,
    total=len(data_files),
    desc="Importing Dataframes",
    unit="tsv files"
):
    # NOTE(review): read_csv treats the first line of each file as the header
    # row. If the shards have no header line, that first example is lost and
    # `header=None` should be passed — confirm against the raw files.
    new_df = pd.read_csv(raw_df_file, sep="\t", nrows=num_rows)

    # Rename the first two columns to the common headers (only if both exist)
    columns = list(new_df.columns)
    if len(columns) >= 2:
        new_df = new_df.rename(columns={columns[0]: col_headers[0], columns[1]: col_headers[1]})

    dfs.append(new_df)

# Merge all dataframes if any exist
if dfs:
    # ignore_index=True gives the merged frame one unique 0..N-1 index instead
    # of repeating each file's own 0..num_rows-1 labels.
    df = pd.concat(dfs, axis=0, ignore_index=True)
    del dfs  # Free up memory
    print("Merged dataframe shape:", df.shape)
    display(df.head())  # Show first few rows
else:
    print("No data found to merge.")


Found TSV files: ['/content/NLP_Dataset/C4_200M.tsv-00006-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00005-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00008-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00001-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00007-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00003-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00009-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00002-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00000-of-00010', '/content/NLP_Dataset/C4_200M.tsv-00004-of-00010']


Importing Dataframes:   0%|          | 0/10 [00:00<?, ?tsv files/s]

Merged dataframe shape: (100000, 2)


Unnamed: 0,Input,Target
0,12/21/2015 Responce Download EPA opposed petit...,12/21/2015 Response Download EPA opposed petit...
1,I will explain toleratly Math points of in the...,I will explain in full detail to the chosen pr...
2,Amit Sheth. Content management; metadata with ...,"Amit Sheth. Content Management, Metadata and S..."
3,Get A booster This Summer!,Get A Boost This Summer!
4,you are locked out of most other actions; whil...,You are locked out of most other actions while...


# Model Training

<h1> Import ML Library </h1>

In [None]:
from sklearn.model_selection import train_test_split

<h1> Split the dataset into 3 parts: train, val, test </h1>

<h1> Encoding dataset </h1>

In [None]:
# Split the dataframe into train (70%), validation (15%) and test (15%) subsets.
# `random_state=seed` makes both shuffles repeatable, so every run trains and
# evaluates on the same rows.
train_df, holdout_df = train_test_split(df, test_size=0.30, random_state=seed)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=seed)

In [None]:
def encode(src_df: pd.DataFrame, tokenizer: AutoTokenizer,
           max_input_length: int = 334, max_target_length: int = 128):
    """
    Encode input and target text using the given tokenizer.

    Args:
        src_df: DataFrame with an 'Input' column (source text) and a 'Target'
            column (reference text).
        tokenizer: Callable tokenizer; its result must expose 'input_ids' and
            'attention_mask'.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_target_length: Targets longer than this many tokens are truncated.

    Returns:
        A list with one dict per row, holding 'input_ids', 'attention_mask'
        and 'labels' (the token ids of the target text).
    """
    document_encoded = []

    # Iterate over the two columns directly; this avoids building a Series
    # object for every row as DataFrame.iterrows() does.
    pairs = zip(src_df['Input'], src_df['Target'])
    for src_document, target_document in tqdm(pairs, total=len(src_df), desc="Encoding documents"):
        # No padding here: each text is tokenized on its own, so there is
        # nothing to pad against. Batches are padded later by the data collator.
        encoded_input = tokenizer(
            src_document,
            truncation=True,
            max_length=max_input_length
        )
        encoded_target = tokenizer(
            target_document,
            truncation=True,
            max_length=max_target_length
        )

        # Store tokenized results
        document_encoded.append({
            'input_ids': encoded_input['input_ids'],
            'attention_mask': encoded_input['attention_mask'],
            'labels': encoded_target['input_ids']
        })

    return document_encoded

<h1> Load Tokenizer </h1>

In [None]:
# Load the tokenizer from the pretrained T5 model
tokenizer = AutoTokenizer.from_pretrained('t5-base')

# Sanity check only: encode the first 5 rows instead of the whole dataframe.
# Nothing below reuses this result, so tokenizing every row here would be
# wasted work.
encoded_data = encode(src_df=df.head(5), tokenizer=tokenizer)

# Display the length of tokenized sequences for the previewed samples
for enc in encoded_data:
    print(f"Input IDs length: {len(enc['input_ids'])}, "
          f"Attention Mask length: {len(enc['attention_mask'])}, "
          f"Labels length: {len(enc['labels'])}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Encoding documents:   0%|          | 0/100000 [00:00<?, ?it/s]

Input IDs length: 25, Attention Mask length: 25, Labels length: 23
Input IDs length: 16, Attention Mask length: 16, Labels length: 13
Input IDs length: 34, Attention Mask length: 34, Labels length: 31
Input IDs length: 7, Attention Mask length: 7, Labels length: 8
Input IDs length: 31, Attention Mask length: 31, Labels length: 27


In [None]:
# Tokenize the training split first, then the validation split, with the same tokenizer
train_encoded, val_encoded = (
    encode(src_df=split_df, tokenizer=tokenizer) for split_df in (train_df, val_df)
)

Encoding documents:   0%|          | 0/70000 [00:00<?, ?it/s]

Encoding documents:   0%|          | 0/15000 [00:00<?, ?it/s]

<h1> Load the T5-model </h1>

In [None]:
# Load the pretrained T5 model for sequence-to-sequence tasks
# (checkpoint name: 't5-base')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Create a data collator for dynamic padding and batching.
# `model` must be the model object itself, not its checkpoint name: the
# collator uses it to prepare decoder inputs from the labels, which a plain
# string cannot provide.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='longest',  # Pad to the longest sequence in the batch
    return_tensors='pt'  # Return PyTorch tensors
)

In [None]:
# Step-based schedule: evaluate and checkpoint every 500 steps, intended for
# large datasets (> 50,000 rows). For small datasets an epoch-based schedule
# can be used instead (eval_strategy="epoch", e.g. num_train_epochs=2,
# save_total_limit=3), keeping the remaining arguments the same.
training_args = Seq2SeqTrainingArguments(
    # NOTE(review): the directory name says "small" but this notebook loads 't5-base'.
    output_dir="my_fine_tuned_t5_small_model",
    eval_strategy="steps",  # Evaluate at step intervals (replaces the deprecated `evaluation_strategy`)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,  # Limits the number of saved checkpoints to avoid excessive storage usage
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA device is present
    gradient_accumulation_steps=6,  # Accumulates gradients over multiple steps to handle larger batch sizes
    eval_steps=500,  # Runs evaluation every 500 steps
    save_steps=500,  # Saves the model every 500 steps
    load_best_model_at_end=True,  # Loads the best model checkpoint at the end of training
    logging_dir="./logs",  # Relative to the working directory, not the filesystem root
    report_to="none",  # Disables reporting to external services
    seed=seed,  # Same seed as the rest of the notebook
)



In [None]:
def compute_metrics(eval_pred: tuple):
    """
    Compute evaluation metrics for model predictions.

    Args:
        eval_pred: Pair of (predictions, labels) holding token ids. Either
            array may use -100 as a padding marker.

    Returns:
        Dict of the ROUGE scores plus 'gen_len' (the mean number of
        non-padding tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the generated ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id, so swap it for the pad token before
    # decoding — in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode predictions and reference labels into text
    pred_decoded = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE score between predictions and ground truth labels
    result = rouge.compute(predictions=pred_decoded, references=decoded_labels, use_stemmer=True)

    # Average generated length, counting only non-padding tokens
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    # Round metric values to 4 decimal places for readability
    return {k: round(v, 4) for k, v in result.items()}


In [None]:
# ROUGE metric object that `compute_metrics` calls during evaluation
rouge = evaluate.load("rouge")

# Gather everything the trainer needs, then build the Seq2SeqTrainer
trainer_kwargs = dict(
    model=model,                      # pretrained model to fine-tune
    args=training_args,               # training configuration
    train_dataset=train_encoded,      # tokenized training examples
    eval_dataset=val_encoded,         # tokenized validation examples
    tokenizer=tokenizer,              # tokenizer for processing text
    data_collator=data_collator,      # pads and batches the examples
    compute_metrics=compute_metrics,  # evaluation metric callback
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,0.7499,0.6071,0.7175,0.6119,0.7098,0.7098,18.0035


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=729, training_loss=0.7281213244947059, metrics={'train_runtime': 1907.9124, 'train_samples_per_second': 36.689, 'train_steps_per_second': 0.382, 'total_flos': 7701338202439680.0, 'train_loss': 0.7281213244947059, 'epoch': 0.9997714285714285})

In [None]:
# Save the trained model to the 'correction' directory
# (a relative path, so it is resolved against the current working directory)
trainer.save_model('correction')

# Model Testing

<h1> Case study 1: Normal Text </h1>

1. Short text

In [None]:
# Run inference on the GPU when one is available, otherwise on the CPU
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

def correct_grammar(input_text, num_return_sequences):
    """
    Generates corrected versions of the input text using the fine-tuned model.

    Args:
        input_text: The sentence or short passage to correct.
        num_return_sequences: How many candidate corrections to return.

    Returns:
        A list of `num_return_sequences` decoded strings.
    """
    # Training can leave the model in train mode; switch to eval mode so
    # dropout does not perturb generation.
    model.eval()

    # Tokenize the single input. No padding is needed for a batch of one, and
    # the tensors go to whichever device the model currently lives on.
    batch = tokenizer(
        [input_text],
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(model.device)

    # Beam search. `temperature` is not passed because it only affects
    # sampling-based decoding, and the beam count is raised if needed because
    # it must be at least the number of sequences requested.
    translated = model.generate(
        **batch,
        max_length=512,
        num_beams=max(4, num_return_sequences),
        num_return_sequences=num_return_sequences,
    )

    # Decode and return the generated text
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

In [None]:
# Example incorrect sentence
text = 'They could culture more land and grows food a lot more.'

# Generate 3 possible grammar-corrected versions and print the resulting list
print(correct_grammar(text, num_return_sequences=3))



['They could cultivate more land and grow food a lot more.', 'They could culture more land and grow food a lot more.', 'They could culture more land and produce food a lot more.']


In [None]:
# Short ungrammatical sentence; request a single correction
text = 'he are an teachers'
print(correct_grammar(text, num_return_sequences=1))

['He is a teacher.']


In [None]:
# Sentence containing misspelled words; request a single correction
text = """These art forms start with sologans to find the talent, but from what I’ve observed, they just entertaiment. """

print(correct_grammar(text, num_return_sequences= 1))

['These art forms start with sologans to find the talent, but from what I’ve observed, they just entertainment.']


2. Long text

In [None]:
# Run inference on the GPU when one is available, otherwise on the CPU
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

def correct_grammar(input_text, num_return_sequences):
    """
    Corrects grammar in long texts by splitting them into smaller chunks
    and processing them individually.

    Args:
        input_text: The text to correct. All whitespace (including paragraph
            breaks) is treated as a word separator.
        num_return_sequences: Number of candidates generated per chunk; only
            the first candidate of each chunk is kept.

    Returns:
        The corrected chunks joined with single spaces ("" for empty input).
    """

    def split_text(text, max_length):
        """Splits text into chunks of whole words, each at most max_length
        characters. A single word longer than max_length becomes its own chunk."""
        chunks = []
        current_chunk = []
        current_len = 0  # length of " ".join(current_chunk), tracked incrementally

        for word in text.split():
            added_len = len(word) + (1 if current_chunk else 0)  # +1 for the joining space
            if current_chunk and current_len + added_len > max_length:
                chunks.append(" ".join(current_chunk))
                current_chunk, current_len = [word], len(word)
            else:
                # An empty chunk always accepts the word, so no empty chunk is
                # ever emitted for an over-long first word.
                current_chunk.append(word)
                current_len += added_len

        if current_chunk:
            chunks.append(" ".join(current_chunk))  # Add the last chunk
        return chunks

    # Training can leave the model in train mode; switch to eval mode so
    # dropout does not perturb generation.
    model.eval()

    # Process each chunk and store the results
    all_translated = []

    for chunk in split_text(input_text, max_length=512):
        # No padding is needed for a batch of one; tensors follow the model's device
        batch = tokenizer(
            [chunk], truncation=True,
            max_length=512, return_tensors="pt"
        ).to(model.device)

        # Beam search. `temperature` only affects sampling, so it is not
        # passed; the beam count must be at least the number of sequences requested.
        translated = model.generate(
            **batch, max_length=512,
            num_beams=max(4, num_return_sequences),
            num_return_sequences=num_return_sequences
        )

        tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        all_translated.append(tgt_text[0])  # Keep the first candidate per chunk

    # Combine all corrected chunks into a complete text
    return " ".join(all_translated)


# Example multi-paragraph text with grammar errors (longer than one 512-character chunk)
text =  """Today gift shows are popular in many countries, and purpose of these shows finds talented people, and help them to introduce themselves to each other. Actually, many people now watch this shows, and during this years find more fans that cause increase the Viewer, and many sponsors Keen on for sponsoring this shows, because gift shows has benefits for them, and this programs convert to tools that earn money, and present their services.

Firstly, result this programme has a massive effect on the society, because many people get a chance to represent their gift. On the other hand, many people have gift, but they do not know, so they have the opportunity to find their gift, and encourage them to follow their interests.

Secondly, many audiences, and viewers watch this shows, so it is a big chance for companies by sponsoring in this program. They can find new customers and introduce their services to each other. For instance, they commercials between the shows certify this issue. Furthermore, TV is one of the tools that entertain people, although the target finds gift, so part of this shows for entertaining people.

As a result, the aim of producing this shows impressive, so part of the society following this shows for entertaining, and the part of the people persuade to find their talents. In fact, this topic has two side that everyone can according to own opinion.
"""

# Generate the corrected version of the text (one candidate) and print it
print(correct_grammar(text, num_return_sequences=1))

Today gift shows are popular in many countries, and purpose of these shows is to find talented people, and help them to introduce themselves to each other. Actually, many people now watch this shows, and during this years find more fans, and many sponsors Keen on for sponsoring this shows, because gift shows have benefits for them, and this programs convert to tools that earn money, and present their services. Firstly, the result this programme has a massive effect on the society, and it has a massive effect on the society. On the other hand, many people get a chance to represent their gifts. On the other hand, many people have gift, but they do not know, so they have the opportunity to find their gift, and encourage them to follow their interests. Secondly, many audiences, and viewers watch this shows, so it is a big chance for companies to sponsor in this program. They can find new customers and introduce their services to each other. For instance, they commercials between the shows 

<h1> Case study 2: Punctuation and contraction errors </h1>

- Install the `contractions` library, which expands simple contractions (e.g. "can't" → "cannot")

In [None]:
!pip install -q contractions

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/289.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/118.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import contractions

# Capitalize the first letter of each sentence
def capitalize_first_letter(text):
    """
    Capitalize the first letter of every sentence and tidy punctuation spacing.

    The text is cleaned in place with regular expressions instead of being
    split and re-joined, so commas are not given doubled spaces, no trailing
    space is left behind, and decimals such as "3.14" stay intact.

    Args:
        text: The text to clean up.

    Returns:
        The text with whitespace before , . ! ? removed, one space after each
        comma and after each sentence-ending mark that is followed by a letter,
        and the first letter of the text and of each sentence upper-cased.
        Limitation: abbreviations such as "e.g." are treated as sentence ends.
    """
    # Remove whitespace in front of punctuation: "word ," -> "word,"
    text = re.sub(r'\s+([,.!?])', r'\1', text.strip())

    # Exactly one space after a comma, unless a digit follows directly ("1,000")
    text = re.sub(r',(?!\d)\s*', ', ', text)

    # Exactly one space between a sentence-ending mark and a following letter.
    # Digits are excluded so decimals are left alone.
    text = re.sub(r'([.!?])\s*(?=[^\W\d_])', r'\1 ', text)

    # Upper-case the first letter of the text and of each following sentence
    text = re.sub(
        r'(^|[.!?]\s+)([^\W\d_])',
        lambda match: match.group(1) + match.group(2).upper(),
        text,
    )

    # A trailing comma may have gained a space above
    return text.strip()


# Function to split text into smaller chunks
def split_text(text, max_length):
    """
    Split text into chunks of whole words, each at most max_length characters.

    Args:
        text: The text to split; any whitespace separates words.
        max_length: Maximum number of characters per chunk, counting the
            single spaces that join the words.

    Returns:
        A list of chunks. A single word longer than max_length becomes its own
        (over-long) chunk; empty or whitespace-only text gives [].
    """
    chunks = []
    current_chunk = []
    current_len = 0  # length of " ".join(current_chunk), tracked incrementally

    for word in text.split():
        added_len = len(word) + (1 if current_chunk else 0)  # +1 for the joining space
        if current_chunk and current_len + added_len > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_len = [word], len(word)
        else:
            # An empty chunk always accepts the word, so an over-long first
            # word never causes an empty chunk to be emitted.
            current_chunk.append(word)
            current_len += added_len

    if current_chunk:
        chunks.append(" ".join(current_chunk))  # Add the remaining chunk

    return chunks


# Correct Grammar
def correct_grammar(input_text, num_return_sequences):
    """
    Correct grammar, punctuation and contractions in a text of any length.

    Steps: collapse repeated punctuation marks, expand contractions, split the
    text into chunks of at most 512 characters, run the model on each chunk,
    then tidy sentence capitalization and spacing.

    Args:
        input_text: The text to correct.
        num_return_sequences: Number of candidates generated per chunk; only
            the first candidate of each chunk is kept, so alternatives are
            never concatenated into the result.

    Returns:
        The corrected chunks joined with single spaces ("" for empty input).
    """
    # Collapse a run of the same mark (optionally separated by whitespace) into
    # one occurrence of that mark: "!!" -> "!", "??" -> "?", "..." -> ".", ",," -> ","
    normalized_text = re.sub(r'([,.!?])(?:\s*\1)+', r'\1', input_text)

    # Expand contractions
    expanded_text = contractions.fix(normalized_text)

    # Split text into smaller chunks within max_length limit
    chunks = split_text(expanded_text, max_length=512)
    corrected_texts = []

    # Training can leave the model in train mode; switch to eval mode so
    # dropout does not perturb generation.
    model.eval()

    for chunk in chunks:
        if not chunk:
            continue  # nothing to correct in an empty chunk

        # No padding is needed for a batch of one; tensors follow the model's device
        batch = tokenizer([chunk], truncation=True, max_length=512, return_tensors="pt").to(model.device)

        # Beam search. `temperature` only affects sampling, so it is not
        # passed; the beam count must be at least the number of sequences requested.
        translated = model.generate(
            **batch,
            max_length=512,
            num_beams=max(4, num_return_sequences),
            num_return_sequences=num_return_sequences
        )
        tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

        # The model's own casing is kept (no blanket lower-casing), so proper
        # nouns and the pronoun "I" survive; only sentence starts are fixed.
        corrected_texts.append(capitalize_first_letter(tgt_text[0]))

    # Merge all chunks into a complete text
    return " ".join(corrected_texts)

In [None]:
# Example usage: text with contractions, missing spaces after punctuation and an ellipsis
input_text = """I can't believe it's already December,time flies so fast! I haven't seen him since last year, he probably won't come to the party.Btw,Do you think she is going to make it? I don't know, but she's been really busy lately, so maybe she won't. Also, I heard that they're planning a surprise for us, but I don't know if it'll be a good idea... What do you think about that? I think it's gonna be great, though! I just hope everyone can come."""
num_return_sequences = 1  # one candidate per chunk

corrected_text = correct_grammar(input_text, num_return_sequences=num_return_sequences)

# Display results
print(f"Corrected Text: {corrected_text}")

Corrected Text: I cannot believe it is already december, time flies so fast! I have not seen him since last year,  he probably won't come to the party. By the way, do you think she is going to make it? I do not know,  but she has been really busy lately,  so maybe she will not. Also,  i heard that they are planning a surprise for us,  but i do not know if it will be a good idea. What do you think about that? I think it is going to be great,  though! 
