In [1]:
!git clone https://github.com/lekshmi-j/grammar-autocorrector.git

Cloning into 'grammar-autocorrector'...
remote: Enumerating objects: 74, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 74 (delta 37), reused 46 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (74/74), 124.46 KiB | 11.31 MiB/s, done.
Resolving deltas: 100% (37/37), done.


In [2]:

%cd grammar-autocorrector

/content/grammar-autocorrector


Build evaluation dataset

In [4]:
originals = [
    "She don't like apples",
    "He go to market yesterday"
]

golds = [
    "She doesn't like apples",
    "He went to the market yesterday"
]


In [5]:
from src.rules import subject_verb_agreement_rule
from src.transformer_corrector import transformer_correct
import spacy

nlp = spacy.load("en_core_web_sm")

rule_outputs = []
ml_rule_outputs = []   # if you have ML-gated rules
transformer_outputs = []

for sent in originals:
    doc = nlp(sent)

    rule_out = subject_verb_agreement_rule(doc)
    rule_outputs.append(rule_out if rule_out else sent)

    # placeholder if ML gating not added yet
    ml_rule_outputs.append(rule_out if rule_out else sent)

    transformer_outputs.append(transformer_correct(sent))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
import pandas as pd


# Example structure
# original: incorrect sentence
# gold: human corrected sentence
# rule: rule-based output
# ml_rule: ML-gated rule output
# transformer: transformer output


eval_df = pd.DataFrame({
"original": originals,
"gold": golds,
"rule": rule_outputs,
"ml_rule": ml_rule_outputs,
"transformer": transformer_outputs
})


# Save for reproducibility
eval_df.to_csv("data/processed/eval_results.csv", index=False)

Automatic Evaluation Metrics

Metric 1 — Exact Sentence Match
Concept

Strict correctness: output must exactly equal gold sentence.



In [9]:
def exact_match(preds, golds):
  matches = [p.strip() == g.strip() for p, g in zip(preds, golds)]
  return sum(matches) / len(matches)

Metric 2 — Word Error Rate (WER)
Concept

Measures number of edits (insertions, deletions, substitutions) required.

WER = (S + D + I) / N

In [11]:
!pip install jiwer


Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [12]:
import jiwer


def compute_wer(preds, golds):
  return jiwer.wer(golds, preds)

Metric 3 — BLEU Score

In [13]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smooth = SmoothingFunction().method1

def sentence_bleu_score(preds, golds):
    scores = []
    for pred, gold in zip(preds, golds):
        score = sentence_bleu(
            [gold.split()],
            pred.split(),
            smoothing_function=smooth
        )
        scores.append(score)
    return sum(scores) / len(scores)


In [14]:
bleu_rule = sentence_bleu_score(eval_df["rule"], eval_df["gold"])
bleu_ml   = sentence_bleu_score(eval_df["ml_rule"], eval_df["gold"])
bleu_tr   = sentence_bleu_score(eval_df["transformer"], eval_df["gold"])


In [15]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

smooth = SmoothingFunction().method1

def corpus_bleu_score(preds, golds):
    references = [[g.split()] for g in golds]
    hypotheses = [p.split() for p in preds]
    return corpus_bleu(references, hypotheses, smoothing_function=smooth)


In [16]:
bleu_rule = corpus_bleu_score(eval_df["rule"], eval_df["gold"])
bleu_ml   = corpus_bleu_score(eval_df["ml_rule"], eval_df["gold"])
bleu_tr   = corpus_bleu_score(eval_df["transformer"], eval_df["gold"])


In [17]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=2d06098acd92694fd72f16ffca6f9188a4b3e94f3a9609d754b0c584fa8bf43a
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [18]:
from rouge_score import rouge_scorer
import numpy as np

def rouge_scores(preds, golds):
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True
    )

    r1, r2, rl = [], [], []

    for pred, gold in zip(preds, golds):
        scores = scorer.score(gold, pred)
        r1.append(scores["rouge1"].fmeasure)
        r2.append(scores["rouge2"].fmeasure)
        rl.append(scores["rougeL"].fmeasure)

    return {
        "ROUGE-1": np.mean(r1),
        "ROUGE-2": np.mean(r2),
        "ROUGE-L": np.mean(rl)
    }


In [19]:
rouge_rule = rouge_scores(eval_df["rule"], eval_df["gold"])
rouge_ml   = rouge_scores(eval_df["ml_rule"], eval_df["gold"])
rouge_tr   = rouge_scores(eval_df["transformer"], eval_df["gold"])


In [20]:
import pandas as pd

results = pd.DataFrame({
    "Method": ["Rule-based", "ML + Rules", "Transformer"],
    "BLEU": [bleu_rule, bleu_ml, bleu_tr],
    "ROUGE-L": [
        rouge_rule["ROUGE-L"],
        rouge_ml["ROUGE-L"],
        rouge_tr["ROUGE-L"]
    ]
})

results


Unnamed: 0,Method,BLEU,ROUGE-L
0,Rule-based,0.079878,0.763636
1,ML + Rules,0.079878,0.763636
2,Transformer,0.078679,0.613636


In [21]:
metrics = {}


metrics["rule"] = {
"exact": exact_match(eval_df["rule"], eval_df["gold"]),
"wer": compute_wer(eval_df["rule"], eval_df["gold"]),
"bleu": bleu_score(eval_df["rule"], eval_df["gold"])
}


metrics["ml_rule"] = {
"exact": exact_match(eval_df["ml_rule"], eval_df["gold"]),
"wer": compute_wer(eval_df["ml_rule"], eval_df["gold"]),
"bleu": bleu_score(eval_df["ml_rule"], eval_df["gold"])
}


metrics["transformer"] = {
"exact": exact_match(eval_df["transformer"], eval_df["gold"]),
"wer": compute_wer(eval_df["transformer"], eval_df["gold"]),
"bleu": bleu_score(eval_df["transformer"], eval_df["gold"])
}


metrics

ValueError: input 0            She doesn't like apples
1    He went to the market yesterday
Name: gold, dtype: object was expected to be a string or list of strings