In [1]:
![ -d grammar-autocorrector ] || git clone https://github.com/lekshmi-j/grammar-autocorrector.git

fatal: destination path 'grammar-autocorrector' already exists and is not an empty directory.


In [2]:
# Work from inside the cloned repository so relative paths used later
# (models/, src/) resolve against it.
%cd grammar-autocorrector

/content/grammar-autocorrector


In [3]:
%pip install -q datasets pandas




In [4]:
import pandas as pd
from datasets import load_dataset


In [5]:
# Load the "validation" split of the JFLEG dataset from the Hugging Face Hub.
dataset = load_dataset("jfleg", split="validation")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Convert to a DataFrame; later cells read its "sentence" and "corrections" columns.
df = pd.DataFrame(dataset)


**Build a training DataFrame**

In [7]:
import pandas as pd

# Interleave every source sentence (label 0) with its first listed
# correction (label 1), keeping the original row order.
data = [
    record
    for original, corrections in zip(df["sentence"], df["corrections"])
    for record in (
        {"sentence": original, "label": 0},
        {"sentence": corrections[0], "label": 1},
    )
]

train_df = pd.DataFrame(data)
train_df.head()


Unnamed: 0,sentence,label
0,So I think we can not live if old people could...,0
1,So I think we would not be alive if our ancest...,1
2,For not use car .,0
3,Not for use with a car .,1
4,Here was no promise of morning except that we ...,0


**Feature engineering (MOST IMPORTANT PART)**

**Feature 1: Sentence length**

In [8]:
def get_sentence_length(sentence):
    """
    Return how many whitespace-separated tokens a sentence contains.

    Parameters:
    sentence (str): A sentence from the dataset

    Returns:
    int: Number of tokens produced by ``sentence.split()``
    """
    return len(sentence.split())


In [9]:
# Store each sentence's word count as a numeric feature column.
train_df["sent_len"] = train_df["sentence"].map(get_sentence_length)


In [10]:
import nltk
# Fetch the NLTK resources (tokenizer tables and English POS tagger) that the
# tokenising / tagging cells below presumably rely on.
nltk.download('punkt_tab')
nltk.download("averaged_perceptron_tagger_eng")




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [11]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize


def pos_sequence(sentence):
    """Return the sentence's POS tags as a single space-separated string."""
    tagged_tokens = pos_tag(word_tokenize(sentence))
    tags = [tag for _, tag in tagged_tokens]
    return " ".join(tags)

# One POS-tag string per sentence, stored alongside the raw text.
train_df["pos_seq"] = train_df["sentence"].apply(pos_sequence)

**Feature 3: Word n-grams (bag of words)**

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams and bigrams, capped at 3000 features.
word_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=3000
)

**Feature 4: POS n-grams**

In [13]:
# Counts over bigrams and trigrams only, capped at 1000 features; fitted later
# on the space-joined POS-tag strings.
pos_vectorizer = CountVectorizer(
    ngram_range=(2, 3),
    max_features=1000
)


**Combine features (feature union)**

In [14]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

def extract_sentence_length(df):
    """
    Pull the precomputed sentence lengths out as a single-column 2-D array.

    Parameters:
    df (DataFrame): Input data containing a 'sent_len' column

    Returns:
    numpy array of shape (n_samples, 1)
    """
    lengths = df["sent_len"].values
    return lengths.reshape(-1, 1)
# Use ColumnTransformer rather than FeatureUnion: FeatureUnion hands the whole
# DataFrame to every transformer, whereas each transformer here must read
# exactly one column.
from sklearn.compose import ColumnTransformer

text_features = ColumnTransformer(
    transformers=[
        # Word-level n-gram features from the raw sentence text.
        # A string selector passes a 1-D Series, which CountVectorizer expects.
        ("word_ngrams", word_vectorizer, "sentence"),

        # POS-tag n-gram features from the space-joined tag sequence.
        ("pos_ngrams", pos_vectorizer, "pos_seq"),

        # Sentence length feature. A list selector passes a DataFrame, which
        # extract_sentence_length indexes by the "sent_len" column.
        ("sentence_length",
         FunctionTransformer(extract_sentence_length, validate=False),
         ["sent_len"]),
    ]
)



**Train ML models**

In [15]:
# Confirm which engineered columns exist before modelling.
print(train_df.columns)


Index(['sentence', 'label', 'sent_len', 'pos_seq'], dtype='object')


In [16]:
# Copy of "pos_seq" under the column name that pos_vectorizer is fitted on below.
train_df["pos_text"] = train_df["pos_seq"]


In [17]:
# Eyeball a few sentences next to their POS-tag strings.
print(train_df[["sentence", "pos_text"]].head())


                                            sentence  \
0  So I think we can not live if old people could...   
1  So I think we would not be alive if our ancest...   
2                                 For not use car .    
3                          Not for use with a car .    
4  Here was no promise of morning except that we ...   

                                            pos_text  
0  RB PRP VBP PRP MD RB VB IN JJ NNS MD RB VB NNS...  
1  RB PRP VBP PRP MD RB VB JJ IN PRP$ NNS VBD RB ...  
2                                      IN RB JJ NN .  
3                                RB IN NN IN DT NN .  
4  RB VBD DT NN IN NN IN IN PRP VBD RP IN DT NNS ...  


In [18]:
# Train–test split: hold out 20% of the rows, with a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split

# The whole train_df is passed as X, so X_train / X_test still carry the
# "label" column; downstream transformers must select feature columns explicitly.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    train_df["label"],
    test_size=0.2,
    random_state=42
)


In [19]:
# Learn the word n-gram vocabulary from the training-split sentences only.
word_vectorizer.fit(X_train["sentence"])


In [20]:
# Learn the POS n-gram vocabulary from the training-split POS strings only.
pos_vectorizer.fit(X_train["pos_text"])


In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import pos_tag, word_tokenize

def pos_sequence(series):
    """Map every sentence in a pandas Series to its space-joined POS tags."""
    def tag_string(text):
        return " ".join(tag for _, tag in pos_tag(word_tokenize(text)))

    return series.apply(tag_string)

def sentence_length(series):
    """Return per-sentence whitespace-token counts as an (n_samples, 1) array."""
    counts = series.apply(lambda text: len(text.split()))
    return counts.values.reshape(-1, 1)

# Three views of the same "sentence" column, concatenated column-wise.
word_ngram_counts = CountVectorizer(ngram_range=(1, 2), max_features=3000)

# Tag the sentences first, then count POS bigrams/trigrams.
pos_ngram_counts = Pipeline([
    ("pos", FunctionTransformer(pos_sequence, validate=False)),
    ("vec", CountVectorizer(ngram_range=(2, 3), max_features=1000)),
])

length_feature = FunctionTransformer(sentence_length, validate=False)

features = ColumnTransformer(
    transformers=[
        ("word_ngrams", word_ngram_counts, "sentence"),
        ("pos_ngrams", pos_ngram_counts, "sentence"),
        ("length", length_feature, "sentence"),
    ]
)


In [23]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Baseline classifier: n-gram / length features feeding a logistic regression.
lr_model = Pipeline([
    ("features", features),
    ("clf", LogisticRegression(max_iter=1000))
])

# Fit on the training split only. Fitting on all of train_df would expose the
# model to the rows held out in X_test and inflate any evaluation on them.
lr_model.fit(X_train[["sentence"]], y_train)


In [None]:
# text_features = ColumnTransformer(
#     transformers=[
#         # Word n-grams from raw sentence text
#         ("word_ngrams", word_vectorizer, "sentence"),

#         # POS n-grams from POS sequence
#         ("pos_ngrams", pos_vectorizer, "pos_seq"),

#         # Sentence length as numeric feature
#         ("sent_len", FunctionTransformer(
#             lambda x: x.values.reshape(-1, 1),
#             validate=False
#         ), ["sent_len"])
#     ]
# )


In [None]:
# lr_model = Pipeline([
#     ("features", text_features),
#     ("clf", LogisticRegression(max_iter=1000))
# ])


In [None]:
# (Re)fit on the training split only; the ColumnTransformer inside the
# pipeline picks the "sentence" column out of X_train.
lr_model.fit(X_train, y_train)


In [None]:
# Peek at the engineered columns of the training split.
print(X_train[["sentence", "pos_seq", "sent_len"]].head())


## Debugging Note: Empty Vocabulary Error

While training the baseline Logistic Regression model, I encountered the following error:

ValueError: empty vocabulary; perhaps the documents only contain stop words


Initially, this looked like a text preprocessing issue, but the dataset itself was fine. The problem turned out to be in how the feature pipeline was constructed.

---

## What Went Wrong

The model uses multiple feature types:

- Word n-grams from the `sentence` column  
- POS n-grams from the `pos_seq` column  
- Sentence length from the `sent_len` column  

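I originally combined these features using `FeatureUnion`. However, `FeatureUnion` passes the entire input object to every transformer. Since the input was a pandas DataFrame, the text vectorizers had no way of knowing which column to read: iterating over a DataFrame yields its column names rather than its rows, so each vectorizer was fitted on a handful of column-name strings instead of the actual sentences. Each of those strings is a single token, so the POS vectorizer (which only builds 2- and 3-grams) found nothing to count and raised the empty-vocabulary error.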

The error message was misleading—the issue was not stop words, but incorrect feature wiring.

---

## How I Fixed It

I replaced `FeatureUnion` with `ColumnTransformer`. This allows each transformer to explicitly specify which DataFrame column it should operate on.

```python
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

text_features = ColumnTransformer(
    transformers=[
        ("word_ngrams", word_vectorizer, "sentence"),
        ("pos_ngrams", pos_vectorizer, "pos_seq"),
        ("sent_len", FunctionTransformer(
            lambda x: x.values.reshape(-1, 1),
            validate=False
        ), ["sent_len"])
    ]
)
```

`FeatureUnion` should be used only when all transformers operate on the same input.

When working with DataFrames and multiple feature columns, `ColumnTransformer` is the correct choice.

An “empty vocabulary” error often points to incorrect input being passed to a vectorizer, not an issue with the text itself.

In [None]:
from sklearn.metrics import classification_report

# Score the logistic-regression pipeline on the X_test split and print the
# per-class metrics report.
y_pred = lr_model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes baseline trained on the same split as the logistic regression.
# NOTE(review): `text_features` must route each transformer to its own
# DataFrame column (see the debugging note above); confirm it is a
# ColumnTransformer-style object before running this cell, otherwise the
# "empty vocabulary" error reappears.
nb_model = Pipeline([
    ("features", text_features),
    ("clf", MultinomialNB())
])

nb_model.fit(X_train, y_train)


In [24]:
!mkdir -p models


mkdir: cannot create directory ‘models’: File exists


In [25]:
import os
# Idempotent directory creation: exist_ok=True means no error if "models" is already there.
os.makedirs("models", exist_ok=True)


In [26]:
# Sanity check: predicted labels for the first five rows of X_test.
lr_model.predict(X_test[:5])


array([0, 1, 1, 0, 1])

In [27]:
import joblib

# Persist the fitted pipeline to disk.
# NOTE(review): the pipeline wraps the notebook-level functions `pos_sequence`
# and `sentence_length` in FunctionTransformer. Pickle-based serialisers store
# functions by reference, so the loading code presumably needs functions with
# the same names to be importable — confirm against src/corrector.
joblib.dump(lr_model, "models/grammar_detector.joblib")


['models/grammar_detector.joblib']

In [28]:
import os
# Verify the serialized model file was actually written.
os.path.exists("models/grammar_detector.joblib")


True

In [29]:
%pip install -q pyspellchecker




In [30]:
from src.corrector import correct_sentence

# Smoke-test the repository's corrector on two sample sentences.
# NOTE(review): the recorded output for the first call is "he gos to market",
# which is not a valid correction, and the "detect here" / "spelling here" /
# "grammar here" lines look like leftover debug prints — both presumably
# originate in src/corrector and should be checked there.
print(correct_sentence("He go to market"))
print(correct_sentence("She went to school"))


detect here
spelling here
grammar here
he gos to market
detect here
She went to school
