## Errors

Checking for grammar and spelling mistakes within articles

In [2]:
import language_tool_python as ltp
import spacy

In [3]:
# Errors-per-word ratio at which evaluate_errors() bottoms out at a score of 0.
ERROR_LIMIT = 0.2

# Fail fast on a misconfigured limit: it is used as a divisor when scoring,
# so it has to lie in the half-open interval (0, 1].
if not (ERROR_LIMIT > 0 and ERROR_LIMIT <= 1):
    raise ValueError("ERROR_LIMIT must be greater than 0 and lower or equal to 1")

In [4]:
# Shared grammar/spelling checker (US English), reused by every evaluate_errors() call.
lang_tool = ltp.LanguageTool("en-US")

# Small English spaCy pipeline; its named entities are used to recognise proper names.
nlp = spacy.load("en_core_web_sm")

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:41<00:00, 5.46MB/s] 
Unzipping C:\Users\joelm\AppData\Local\Temp\tmpt31osged.zip to C:\Users\joelm\.cache\language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to C:\Users\joelm\.cache\language_tool_python.


In [7]:
# LanguageTool rule IDs whose matches are never counted as errors.
_IGNORED_RULE_IDS = frozenset({
    "EN_QUOTES",
    "DASH_RULE",
    "EXTREME_ADJECTIVES",
    "MONTH_OF_XXXX",
    "ENGLISH_WORD_REPEAT_BEGINNING_RULE",
})

# spaCy entity labels whose words are treated as names when filtering
# "Possible spelling mistake" matches.
_NAME_ENTITY_LABELS = frozenset({
    "PERSON", "NORP", "FAC", "FACILITY", "ORG", "GPE", "LOC",
    "PRODUCT", "EVENT", "WORK_OF_ART", "LAW",
})


def evaluate_errors(content: str) -> float:
    """Score the spelling and grammar quality of ``content`` between 0 and 1.

    The text is checked with the module-level ``lang_tool``. A match is not
    counted as an error when any of the following holds:

    * its rule ID is listed in ``_IGNORED_RULE_IDS``;
    * its category is ``"REDUNDANCY"``;
    * its message contains ``"is British English"``;
    * the same matched text has already been counted (errors are unique per text);
    * it is a "Possible spelling mistake" whose matched text is one of the
      whitespace-separated words of a named entity found by the module-level
      ``nlp`` pipeline (labels in ``_NAME_ENTITY_LABELS``).

    The remaining error count is scaled by the word count and ``ERROR_LIMIT``::

        score = 1 - errors / (word_count * ERROR_LIMIT)

    and clamped at 0, so a text whose errors-per-word ratio reaches
    ``ERROR_LIMIT`` (or more) scores 0.

    Args:
        content: The text to evaluate.

    Returns:
        A float in ``[0, 1]``: 1.0 means no counted errors, values near 0 mean
        many errors. Empty or whitespace-only content contains no words to get
        wrong and returns 1.0.
    """
    words = content.split()
    if not words:
        # Nothing to check; also avoids dividing by a zero word count below.
        return 1.0

    matches = lang_tool.check(content)

    # Individual words of every recognised name, e.g. "Angela Merkel" -> {"Angela", "Merkel"}.
    name_tokens = {
        token
        for ent in nlp(content).ents
        if ent.label_ in _NAME_ENTITY_LABELS
        for token in ent.text.split()
    }

    counted_texts = set()
    error_count = 0
    for match in matches:
        text = match.matchedText
        if (
            match.ruleId in _IGNORED_RULE_IDS
            or match.category == "REDUNDANCY"
            or "is British English" in match.message
            or text in counted_texts  # count each distinct matched text only once
            or ("Possible spelling mistake" in match.message and text in name_tokens)
        ):
            continue
        counted_texts.add(text)
        error_count += 1

    subscore = 1 - error_count / (len(words) * ERROR_LIMIT)
    return max(subscore, 0.0)

In [10]:
# Sanity check: a clean sentence followed by one with a misspelling and a repeated word.
errors = [
    evaluate_errors(sample)
    for sample in (
        "The quick brown fox jumped over the lazy dog.",
        "The quick bron fox jumped over dog lazy dog",
    )
]

print(errors)

[1.0, 0.4444444444444444]
