In [1]:
# Fetch the project repository into the current working directory.
!git clone https://github.com/lekshmi-j/grammar-autocorrector.git

Cloning into 'grammar-autocorrector'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 16 (delta 2), reused 10 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (16/16), done.
Resolving deltas: 100% (2/2), done.


In [2]:
%cd grammar-autocorrector

/content/grammar-autocorrector


# Phase 1: Text & Error Analysis

Objective:
Understand real-world grammatical errors and analyze them using
tokenization, POS tagging, and dependency parsing.


In [3]:
!pip install datasets pandas




In [4]:
import pandas as pd
from datasets import load_dataset


In [6]:
# Load the "validation" split of the dataset published under the name "jfleg".
dataset = load_dataset("jfleg", split="validation")


In [7]:
# Convert the loaded split to a DataFrame for inspection and sampling.
df = pd.DataFrame(dataset)


In [8]:
df.columns


Index(['sentence', 'corrections'], dtype='object')

In [9]:
# Inspect the raw data: preview the first rows.
df.head()


Unnamed: 0,sentence,corrections
0,So I think we can not live if old people could...,[So I think we would not be alive if our ances...
1,For not use car .,"[Not for use with a car . , Do not use in the ..."
2,Here was no promise of morning except that we ...,"[Here was no promise of morning , except that ..."
3,Thus even today sex is considered as the least...,"[Thus , even today , sex is considered as the ..."
4,image you salf you are wark in factory just to...,[Imagine yourself you are working in factory j...


In [10]:
# Show the first five source sentences next to their first listed correction.
divider = "-" * 50
for row_label in range(5):
    print(f"❌ Incorrect: {df.at[row_label, 'sentence']}")
    print(f"✔ Corrected: {df.at[row_label, 'corrections'][0]}")
    print(divider)


❌ Incorrect: So I think we can not live if old people could not find siences and tecnologies and they did not developped . 
✔ Corrected: So I think we would not be alive if our ancestors did not develop sciences and technologies . 
--------------------------------------------------
❌ Incorrect: For not use car . 
✔ Corrected: Not for use with a car . 
--------------------------------------------------
❌ Incorrect: Here was no promise of morning except that we looked up through the trees we saw how low the forest had swung . 
✔ Corrected: Here was no promise of morning , except that we looked up through the trees , and we saw how low the forest had swung . 
--------------------------------------------------
❌ Incorrect: Thus even today sex is considered as the least important topic in many parts of India . 
✔ Corrected: Thus , even today , sex is considered as the least important topic in may parts of India . 
--------------------------------------------------
❌ Incorrect: image you sal

### Analysing sentence lengths

In [11]:
def get_sentence_length(sentence):
    """Return the number of whitespace-separated tokens in ``sentence``.

    ``str.split()`` without arguments splits on any run of whitespace and
    ignores leading/trailing whitespace, so an empty or blank string gives 0.
    """
    return len(sentence.split())


In [12]:
# Add a per-row token count computed from the source sentence.
df["sentence_length"] = df["sentence"].map(get_sentence_length)


In [13]:
# Summary statistics of the token counts.
# NOTE(review): the recorded output shows min = 0, i.e. at least one sentence
# has no tokens — consider whether such rows should be filtered out.
df["sentence_length"].describe()


Unnamed: 0,sentence_length
count,755.0
mean,18.556291
std,10.142248
min,0.0
25%,12.0
50%,16.0
75%,23.0
max,80.0


In [14]:
# Take five random rows (fixed seed 42) and show only the text columns.
df.sample(5, random_state=42)[["sentence", "corrections"]]


Unnamed: 0,sentence,corrections
291,For example we can see on the discovery channe...,[For example we can see on the Discovery Chann...
536,People use the public water to drink water .,"[People use public water to drink . , People u..."
39,Very soon they will run out at the current rat...,[They will run out very soon at the current ra...
77,It is obvious that after returning i was tired...,[It was obvious after returning that I was tir...
493,And there is a lot of critics concerned that t...,[And there are a lot of critics concerned that...


### Dataset Observations

- Sentences often contain multiple grammatical errors
- Corrections may involve tense, agreement, or article fixes
- Some corrections go beyond minimal fixes and slightly rephrase the sentence, which adds noise to error-type analysis


In [19]:
# Number of sentences drawn for manual error annotation.
# The hand-written labels further down address sampled rows 0-29, so changing
# this value (or the sampling seed) may invalidate them.
SAMPLE_SIZE = 30


In [20]:
# Draw the fixed subset of sentences used for the manual annotation.
analysis_df = df.sample(
    n=SAMPLE_SIZE,      # number of rows to select
    random_state=42     # fixed seed so the same rows are drawn on every run
)


In [21]:
# Renumber the sampled rows from 0 (discarding their original labels) so the
# annotation cell can address them with analysis_df.loc[0], .loc[1], ...
analysis_df = analysis_df.reset_index(drop=True)


In [22]:
analysis_df.head()


Unnamed: 0,sentence,corrections,sentence_length
0,For example we can see on the discovery channe...,[For example we can see on the Discovery Chann...,22
1,People use the public water to drink water .,"[People use public water to drink . , People u...",9
2,Very soon they will run out at the current rat...,[They will run out very soon at the current ra...,13
3,It is obvious that after returning i was tired...,[It was obvious after returning that I was tir...,18
4,And there is a lot of critics concerned that t...,[And there are a lot of critics concerned that...,22


In [24]:
# List every sampled sentence (numbered from 1) with its first correction.
rule = "-" * 60
numbered_pairs = zip(
    analysis_df.index, analysis_df["sentence"], analysis_df["corrections"]
)
for label, original_text, candidate_fixes in numbered_pairs:
    first_fix = candidate_fixes[0]
    print(
        f"{label + 1}. ❌ {original_text}",
        f"   ✔ {first_fix}",
        rule,
        sep="\n",
    )


1. ❌ For example we can see on the discovery channel in wild life many people are hobbies to learn from the animals . 
   ✔ For example we can see on the Discovery Channel in the wild many people enjoy learning about the animals . 
------------------------------------------------------------
2. ❌ People use the public water to drink water . 
   ✔ People use public water to drink . 
------------------------------------------------------------
3. ❌ Very soon they will run out at the current rate of utilisation . 
   ✔ They will run out very soon at the current rate of utilization . 
------------------------------------------------------------
4. ❌ It is obvious that after returning i was tired and the night is meant to sleep ! . 
   ✔ It was obvious after returning that I was tired , and the night is meant to sleep ! 
------------------------------------------------------------
5. ❌ And there is a lot of critics concerned that the reqired testing is so long that declined the valueable fo

## Error Categories (Initial)

- SPELL: Incorrect spelling
- SVA: Subject–Verb Agreement errors
- ARTICLE: Incorrect or missing a/an/the
- TENSE: Wrong verb tense
- VERB FORM: Incorrect verb form (gerund, infinitive, auxiliary)


In [25]:
# NOTE(review): redundant — the following cell rebinds `annotations` before
# it is used, so this cell has no lasting effect.
annotations = []


In [26]:
# Hand-assigned error categories for the sampled sentences, in sample order:
# entry k labels analysis_df row k (the trailing comment is the 1-based
# sentence number). An empty list means no error category was assigned.
# "Considered, not applied" notes record label sets that were weighed for a
# sentence but deliberately left out of the final annotation.
MANUAL_ERROR_LABELS = [
    ["SPELL", "ARTICLE", "VERB FORM"],  # 1
    ["ARTICLE", "VERB FORM"],           # 2
    ["SPELL"],                          # 3
    ["TENSE", "SVA", "ARTICLE"],        # 4
    ["SPELL", "SVA", "VERB FORM"],      # 5
    ["ARTICLE"],                        # 6  (considered, not applied: ARTICLE + PUNCTUATION)
    [],                                 # 7  (no grammatical error – identical)
    ["SPELL"],                          # 8
    ["VERB FORM"],                      # 9
    ["ARTICLE"],                        # 10 (considered, not applied: PREPOSITION)
    ["SPELL", "VERB FORM"],             # 11
    ["VERB FORM"],                      # 12
    ["SPELL"],                          # 13
    [],                                 # 14 (correct sentence)
    ["VERB FORM", "SPELL"],             # 15
    [],                                 # 16 (correct sentence)
    ["SVA"],                            # 17 (considered, not applied: SVA + PREPOSITION)
    ["TENSE", "VERB FORM"],             # 18
    ["SPELL", "ARTICLE"],               # 19
    ["SPELL", "VERB FORM"],             # 20
    ["VERB FORM"],                      # 21
    ["ARTICLE", "SVA"],                 # 22
    ["SPELL", "SVA"],                   # 23
    ["SPELL", "VERB FORM"],             # 24
    ["SVA", "ARTICLE"],                 # 25
    ["ARTICLE"],                        # 26
    ["VERB FORM"],                      # 27
    ["VERB FORM"],                      # 28 (considered, not applied: PRONOUN)
    ["VERB FORM", "ARTICLE"],           # 29
    ["SPELL", "SVA"],                   # 30
]

# Fail early with a readable message if the sample holds fewer rows than the
# label table expects (otherwise the lookup below raises a bare KeyError).
assert len(MANUAL_ERROR_LABELS) <= len(analysis_df), (
    f"{len(MANUAL_ERROR_LABELS)} label sets but only "
    f"{len(analysis_df)} sampled sentences"
)

# One record per annotated sentence; list(...) copies each label list so the
# records do not alias the table above.
annotations = [
    {
        "sentence": analysis_df.loc[row_label, "sentence"],
        "error_types": list(labels),
    }
    for row_label, labels in enumerate(MANUAL_ERROR_LABELS)
]


In [27]:
# Convert the annotations to a DataFrame (one row per sentence) and display it.
error_df = pd.DataFrame(annotations)
error_df


Unnamed: 0,sentence,error_types
0,For example we can see on the discovery channe...,"[SPELL, ARTICLE, VERB FORM]"
1,People use the public water to drink water .,"[ARTICLE, VERB FORM]"
2,Very soon they will run out at the current rat...,[SPELL]
3,It is obvious that after returning i was tired...,"[TENSE, SVA, ARTICLE]"
4,And there is a lot of critics concerned that t...,"[SPELL, SVA, VERB FORM]"
5,"In my opinion , this statement is groundless a...",[ARTICLE]
6,It is more exciting and memorable .,[]
7,"In deed , they can be refuced .",[SPELL]
8,Successful people have to do things stablely a...,[VERB FORM]
9,"For example , my parents went to a group tour ...",[ARTICLE]


In [28]:
from collections import Counter

# Flatten the per-sentence label lists into one list, then tally each category.
all_errors = [
    label
    for sentence_labels in error_df["error_types"]
    for label in sentence_labels
]

Counter(all_errors)


Counter({'SPELL': 12, 'ARTICLE': 10, 'VERB FORM': 14, 'TENSE': 2, 'SVA': 7})

## Observations from Manual Error Analysis

- Many sentences contain multiple grammatical errors
- Tense errors often co-occur with time expressions
- Article errors require noun-level context
- Verb form errors are distinct from tense errors
- Over-correction risk is high if context is ignored


In [31]:
# Save the manual annotations as CSV (path is relative to the working directory).
from pathlib import Path

ANNOTATIONS_CSV = Path("data/processed/manual_error_annotations.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists first (a fresh clone may not contain data/processed/).
ANNOTATIONS_CSV.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): `error_types` holds Python lists; CSV stores them as their
# string form, so they will need parsing when the file is read back.
error_df.to_csv(ANNOTATIONS_CSV, index=False)
