In [4]:
#!pip install textstat
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension

In [3]:
import pandas as pd
import re
import textstat
import ipywidgets as widgets
import itertools
import json
from collections import namedtuple

In [1]:
test_data = (
    "Playing games has always been thought to be important to "
    "the development of well-balanced and creative children; "
    "however, what part, if any, they should play in the lives "
    "of adults has never been researched that deeply. I believe "
    "that playing games is every bit as important for adults "
    "as for children. Not only is taking time out to play games "
    "with our children and other adults valuable to building "
    "interpersonal relationships but is also a wonderful way "
    "to release built up tension, don't miss this opportunity, you require it."
    " Children require play time on their own. Directing play reduces"
    " the chance of the child exercising creative thinking, boredom "
    "is a powerful creative driver for both children and adults.\n"
    "When a child asks her mother \"Where do babies come from?\", what "
    "should one reply to her?\n"
    "John.Smith@domain.co.uk\n"
    "First written in 2020.06.01\n"
    "This is a text of a longish - i.e. more than the 14 words - sentence, with a number "
    "of complex terms and structures that makes the sentence very long winded and difficult to"
    "read; length, the use of passive voice and vague filler words should be picked by the  script.\n"
    " Continuing with the discussion,  the authors would like to point out that, in our "
    "opinion, we disagree with the decisions made.  This is another example of sentences "
    "that have no place in modern business communication; times have moved on."
)

In [5]:
def text_stats(input_text):
    test={}
    test["flesch_reading_ease_index"]= textstat.flesch_reading_ease(input_text)
    #0-59 is difficult, 0= Very Confusing - 100= Very Easy    
    test["smog_grade"]= textstat.smog_index(input_text)
    #Grade level needed to read it
    test["flesch_kincaid_grade"]= textstat.flesch_kincaid_grade(input_text)
    #Grade level needed to read it 
    test["coleman_liau_grade"]= textstat.coleman_liau_index(input_text)
    #Grade level needed to read it  
    test["ari_grade"]= textstat.automated_readability_index(input_text)
    #Grade level needed to read it    
    test["dale_chall_score"]=textstat.dale_chall_readability_score(input_text)
    #Readability score, 4.9 or lower=4th grader, 9-9.9=13th-15th (College)    
    test["dale_chall_grade"]=round(4.9*(1-(test["dale_chall_score"]-4.9)/(9.9-4.9))+15.0*((test["dale_chall_score"]-4.9)/(9.9-4.9)),2)
    test["linsear_grade"]= textstat.linsear_write_formula(input_text)
    #Grade level to read it   
    test["gunning_fog_grade"]=textstat.gunning_fog(input_text)
    #Grade level needed to read it  
    test["consensus"] = textstat.text_standard(input_text, float_output=True)
    #Consensus grade 
    return test

In [25]:
import json
with open('rules.json', encoding='ISO-8859-1') as json_file:
    rules = json.load(json_file)
for index, rule in enumerate (rules, start=1):
    print("Rule "+str(index),rule['guidance'])
    test_string= rule['test_string'].encode().decode('unicode_escape')
    if rule["case_sensitive"]:
        matches= re.findall(rule["regex"], test_string )
    else:
        matches= re.findall(rule["regex"], test_string, flags= re.IGNORECASE)
    if len(matches)!= rule['test_hits']:
        print(str(rule['id']) +" failed the test string, pls review it")
    

Rule 1 Avoid filler words
Rule 2 Avoid filler words
Rule 3 Avoid filler words
Rule 4 Avoid filler words
Rule 5 The readers know that, this is an audit report.
Rule 6 The readers know that, this is an audit report.
Rule 7 The readers know that, this is an audit report.
Rule 8 Do you absolutely need to tell the reader this?
Rule 9 Can mean we haven't done enough work to support the conclusion.
Rule 10 Can mean we haven't done enough work to support the conclusion.
Rule 11 Use precise words/numbers. "around" sounds woolly
Rule 12 Subjective statement, word it as an objectively/fact-based statement if you can.
Rule 13 Check if the agreed style is to use 'we' as opposed to the name of the department, etc.
Rule 14 Expand to do not
Rule 15 Expand to does not
Rule 16 Expand to Will not
Rule 17 Expand to Would not
Rule 18 Expand to Cannot
Rule 19 Expand to Have not
Rule 20 Avoid mid-sentence use of however. Better at the start of a sentence to flag where the issue is. Also check if you can avoi

In [20]:
def split_into_sentences(text):
    alphabets= "([A-Za-z])"
    suffixes = "(Inc|Ltd|Jr|Sr|Co)"
    starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    prefixes = "(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]"
    websites = "[.](com|net|org|io|gov|me|edu|co|uk)"
    digits = "([0-9])"
    text = " " + text + "  "
    text = text.replace("\n","<stop> ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub("(\w*)[.](\w+@)","\\1<prd>\\2",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","<excl>\"")
    if "?" in text: text = text.replace("?\"","<quest>\"")
    if "..." in text: text = text.replace("...","<prd><prd><prd>")
    if "e.g." in text: text = text.replace("e.g.","e<prd>g<prd>") 
    if "i.e." in text: text = text.replace("i.e.","i<prd>e<prd>")
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    text = text.replace("<excl>","!")
    text = text.replace("<quest>","?")
    text = text.replace("<stop><stop>","<stop>")
    sentences = text.split("<stop>")
    #sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences


In [21]:
def split_into_paragraphs(text):
    paragraphs = text.split("\n")
    paragraphs = [s.strip() for s in paragraphs]
    return paragraphs


In [22]:
ta=widgets.Textarea(
    value=test_data,
    placeholder='Type something',
    description='',
    disabled=False,
    layout=widgets.Layout(width='80%', height='200px'))
display(ta)

Textarea(value='Playing games has always been thought to be important to the development of well-balanced and …

In [23]:
text_to_check=ta.value
text_stats(text_to_check)

{'flesch_reading_ease_index': 47.15,
 'smog_grade': 12.8,
 'flesch_kincaid_grade': 14.7,
 'coleman_liau_grade': 11.56,
 'ari_grade': 18.5,
 'dale_chall_score': 7.68,
 'dale_chall_grade': 10.52,
 'linsear_grade': 12.0,
 'gunning_fog_grade': 15.04,
 'consensus': 12.0}

In [24]:
for index, rule in enumerate (rules, start=1):
    #print("Rule "+str(index))
    if rule["case_sensitive"]:
        matches= re.finditer(rule["regex"], text_to_check)
    else:
        matches= re.finditer(rule["regex"], text_to_check, flags=re.IGNORECASE)
    try:
        first_match= next(matches)
    except StopIteration:  
        pass
        #print("No match!") # action for no match
    else:
        for m in itertools.chain([first_match], matches):
            s= m.start()-20
            e= m.end()+20
            print("Hit:"+m.group(0))
            print("Guidance: " + rule["guidance"])
            print("Context: ..."+ text_to_check[s:e]+ "...")

Hit:don't
Guidance: Expand to do not
Context: ...e built up tension, don't miss this opportuni...
Hit:; however
Guidance: Avoid mid-sentence use of however. Better at the start of a sentence to flag where the issue is. Also check if you can avoid it completely.
Context: ...nd creative children; however, what part, if any,...
Hit:a number
Guidance: Vague, add the actual number
Context: ...ds - sentence, with a numberof complex terms and...
Hit:drive
Guidance: Consultancy lingo
Context: ...a powerful creative driver for both children ...
Hit:; however
Guidance: Do you really need a semicolon?
Context: ...nd creative children; however, what part, if any,...
Hit:; length
Guidance: Do you really need a semicolon?
Context: ...and difficult toread; length, the use of passive...
Hit:; times
Guidance: Do you really need a semicolon?
Context: ...siness communication; times have moved on....
Hit:  
Guidance: Double spaces
Context: ...uld be picked by the  script.
 Continuing ...
Hit:  
Guidance: 

In [12]:
split_into_paragraphs(text_to_check)

["Playing games has always been thought to be important to the development of well-balanced and creative children; however, what part, if any, they should play in the lives of adults has never been researched that deeply. I believe that playing games is every bit as important for adults as for children. Not only is taking time out to play games with our children and other adults valuable to building interpersonal relationships but is also a wonderful way to release built up tension, don't miss this opportunity, you require it. Children require play time on their own. Directing play reduces the chance of the child exercising creative thinking, boredom is a powerful creative driver for both children and adults.",
 'When a child asks her mother "Where do babies come from?", what should one reply to her?',
 'John.Smith@domain.co.uk',
 'First written in 2020.06.01',
 'This is a text of a longish - i.e. more than the 14 words - sentence, with a numberof complex terms and structures that make

In [13]:
split_into_sentences(text_to_check)

['Playing games has always been thought to be important to the development of well-balanced and creative children; however, what part, if any, they should play in the lives of adults has never been researched that deeply.',
 'I believe that playing games is every bit as important for adults as for children.',
 "Not only is taking time out to play games with our children and other adults valuable to building interpersonal relationships but is also a wonderful way to release built up tension, don't miss this opportunity, you require it.",
 'Children require play time on their own.',
 'Directing play reduces the chance of the child exercising creative thinking, boredom is a powerful creative driver for both children and adults.',
 'When a child asks her mother "Where do babies come from?", what should one reply to her?',
 'John.Smith@domain.co.uk',
 'First written in 2020.06.01',
 'This is a text of a longish - i.e. more than the 14 words - sentence, with a numberof complex terms and stru