In [38]:
import spacy
import re
from nltk import sent_tokenize
from lemminflect import getInflection
# Load the spaCy pipeline once; main() below calls this module-level `nlp` object for every parse.
nlp = spacy.load('en_core_web_trf')

In [4]:
def clean_correction(f):
    """Decorator that tidies the string returned by a correction function.

    The wrapped function is expected to return a string (possibly empty)
    assembled by joining sentence fragments with spaces. The wrapper:

    * collapses runs of spaces into one;
    * removes whitespace in front of ``? . ; : , !``;
    * re-attaches a detached negative contraction (``does n't`` -> ``doesn't``)
      and fuses ``can not`` into ``cannot``;
    * strips the result and upper-cases its first character.

    An empty correction stays empty.
    """
    from functools import wraps

    @wraps(f)
    def get_correction(*args, **kwargs):

        correction = f(*args, **kwargs)

        correction = re.sub(' +', ' ', correction)
        correction = re.sub(r'\s(?=[?.;:,!])', r'', correction)
        # Only a real contraction (n't / n`t / n’t) is glued to the previous word;
        # ordinary words such as "net" or "nut" must keep their space. The
        # "can not" rule is anchored on word boundaries so that e.g.
        # "scan nothing" is left alone.
        correction = re.sub(r"(\s(?=n['`’]t\b)|(?<=\b[Cc]an)\s(?=not\b))", r'', correction)
        correction = correction.strip()
        if correction:
            correction = correction[0].upper() + correction[1:]
        return correction

    return get_correction

In [5]:
def find_subj(pred):
    """Return the subject tokens of the predicate token ``pred``, in sentence order.

    Subjects are the children of ``pred`` whose dependency label starts with
    ``nsubj`` or ``csubj``. On top of that:

    * if ``pred`` is itself an auxiliary, the subjects of its head are added;
    * if ``pred`` has a child "there", its ``attr`` children are added;
    * in a non-question sentence, a leading wh-subject is replaced by the
      ``attr`` child of ``pred`` (misconstructed indirect questions);
    * conjuncts of every subject found are added;
    * if nothing was found and ``pred`` is a conjunct, the subjects of its
      head are used instead (the search recurses up the ``conj`` chain).

    Returns a list of tokens sorted by their index ``i``; the list may be empty.
    """
    subject_deps = ('nsubj', 'csubj')

    # simple cases
    subjects = [child for child in pred.children if child.dep_.startswith(subject_deps)]

    # if predicate is an auxiliary, we want to take subjects of its head
    if pred.dep_.startswith('aux'):
        subjects += [child for child in pred.head.children if child.dep_.startswith(subject_deps)]

    # handling 'there is' and 'there are' cases
    if any(child.lower_ == 'there' for child in pred.children):
        subjects += [child for child in pred.children if child.dep_ == 'attr']

    # handling misconstructed indirect questions e.g.
    # "He must understand what is happiness for him"
    # to consider "happiness" the subject, not "what"
    if subjects and subjects[0].tag_.startswith('W') \
            and pred.doc[-1].text != '?' \
            and pred.dep_ != 'relcl':
        attr = next((t for t in pred.children if t.dep_ == 'attr'), None)
        if attr is not None:
            # nbor() raises IndexError on the last token of the document, so the
            # comparative check is only made when a following token exists.
            has_next = pred.i + 1 < len(pred.doc)
            if not has_next or pred.nbor().tag_ != 'JJR':
                subjects[0] = attr

    # handling conjuncts: multiple subjects as in 'Mother and father are key figures in a child's life'.
    add_subj = []
    for subject in subjects:
        add_subj += list(subject.conjuncts)

    # a conjunct predicate without its own subject shares the subject of its head
    cur_pred = pred
    while not subjects and cur_pred.dep_ == "conj":
        cur_pred = cur_pred.head
        subjects = find_subj(cur_pred)

    subjects += add_subj

    # the subjects' order may be different from sentence order, so we arrange it right
    subjects.sort(key=lambda subj: subj.i)

    return subjects

In [6]:
def find_pred_subj(doc):
    """Collect the predicates of ``doc`` together with their subjects.

    Returns a pair ``(pred_sub, conj_pred)``:

    * ``pred_sub`` is a list of ``(predicate, subjects)`` tuples. ``predicate``
      is a tuple of tokens: ``(verb,)`` for a simple predicate, or
      ``(aux, verb)`` / ``(aux, aux_2, verb)`` for an analytical one.
      ``subjects`` is the list returned by ``find_subj`` and may be empty.
    * ``conj_pred`` maps the index of a predicate's last token to the indices
      of its conjunct tokens that share the same subject.
    """
    pred_sub = []
    predicates = []
    conj_pred = {}

    for token in doc:
        if token.pos_ not in ('AUX', 'VERB'):
            continue

        # for analytical predicates
        if token.tag_ in ('VBN', 'VBG', 'VB'):
            aux = None
            aux_2 = None
            for ch in token.children:
                if ch.dep_[:3] == 'aux' and ch.pos_ in ('VERB', 'AUX'):
                    # the first non-participle auxiliary leads the predicate,
                    # any other auxiliary is stored as the second one
                    if ch.tag_ != 'VBN' and aux is None:
                        aux = ch
                    else:
                        aux_2 = ch
            if aux is not None:
                if aux_2 is not None:
                    pred_sub.append(((aux, aux_2, token), find_subj(aux)))
                else:
                    pred_sub.append(((aux, token), find_subj(aux)))

        # all other cases
        elif token.dep_ in ('ROOT', 'ccomp', 'xcomp', 'acl', 'relcl', 'parataxis', 'advcl', 'pcomp'):
            pred_sub.append(((token,), find_subj(token)))

        # conjuncts: when there are multiple predicates connected by conjunction,
        # a conjunct only becomes a predicate of its own if its subject differs
        elif token.dep_ == 'conj' and token.head in predicates:
            own_subjects = find_subj(token)
            if own_subjects != find_subj(token.head):
                pred_sub.append(((token,), own_subjects))

        predicates = [t[0][-1] for t in pred_sub]

    for p, s in pred_sub:
        # Key by the predicate's own last token: that is the index pred_chunk
        # looks up. (The loop variable of the scan above must not be used here;
        # after the scan it just points at the last token of the document.)
        pred_i = p[-1].i
        for c in p[-1].conjuncts:
            if find_subj(c) == s:
                conj_pred.setdefault(pred_i, []).append(c.i)

    return pred_sub, conj_pred

In [7]:
def subject_chunk(sent, subject):
    '''Extract the subject phrase,
    i.e. the part of the sentence that contains
    all the subjects' noun phrases.

    ``sent`` is the parsed sentence and ``subject`` a non-empty list of
    subject tokens. Returns a dict with the keys:

    - 'start', 'end': index of the first / last token of the phrase;
    - 'span': ``sent[start:end+1]``;
    - 'phrase': text of the span, with the first word lower-cased when it
      opens the sentence and is neither a proper noun nor "I";
    - 'conj': text of the last coordinating conjunction found among the
      subjects' children, or None (always None for a single subject).
    '''

    subject_phrase = dict()

    # 'conj' is always present so that callers never hit a KeyError
    subject_phrase['conj'] = None
    if len(subject) > 1:
        conjunctions = [child for s in subject for child in s.children if child.tag_ == 'CC']
        if conjunctions:
            subject_phrase['conj'] = conjunctions[-1].text

    # the noun chunks are the same for every subject, so they are computed once
    noun_chunks = list(sent.noun_chunks)
    s_chunks = []
    for s in subject:
        chunk = next((nc for nc in noun_chunks if s in nc), None)
        # a subject outside every noun chunk stands for itself
        s_chunks.append(chunk if chunk is not None else [s])

    subject_phrase['start'] = s_chunks[0][0].i
    subject_phrase['end'] = s_chunks[-1][-1].i

    span = sent[subject_phrase['start']:subject_phrase['end']+1]
    subject_phrase['span'] = span

    first = span[0]
    if (first.is_sent_start
            and first.tag_ not in {'NNP', 'NNPS'}
            and first.text != 'I'):
        # the phrase will be moved away from the sentence start, so its capital goes
        subject_phrase['phrase'] = ' '.join([first.lower_, span[1:].text])
    else:
        subject_phrase['phrase'] = span.text

    return subject_phrase

In [8]:
def pred_chunk(sent, pred, conj_p):
    '''Extract the predicate phrase,
    i.e. the part of the sentence that contains the predicate
    and all its conjunct predicates, if any.

    ``pred`` is a tuple of predicate tokens (auxiliaries first, lexical verb
    last) and ``conj_p`` maps the index of a lexical verb to the indices of
    its conjunct predicates. Returns a dict with the keys:

    - 'start', 'end': index of the first / last token of the phrase;
    - 'predicates': the lexical verb followed by its conjunct predicates;
    - 'n_conj': number of coordinated predicates (1 when there are none);
    - 'span': ``sent[start:end+1]``;
    - 'span_wo_aux': the span trimmed to its first and last token that is
      neither an AUX nor a subject;
    - 'phrase', 'phrase_wo_aux': text of the two spans with the first
      character lower-cased; in 'phrase' a leading contraction
      ('m, 're, 's, 'd, 'll) and the stem "ca" of "can't" are expanded.
    '''

    pred_phrase = dict()
    pred_phrase['start'] = min(p.i for p in pred)
    pred_phrase['end'] = max(p.i for p in pred)
    pred_phrase['predicates'] = [pred[-1]]
    pred_phrase['n_conj'] = 1

    conjunct_ids = conj_p.get(pred[-1].i)
    if conjunct_ids:
        pred_phrase['end'] = max(max(conjunct_ids), pred[-1].i)
        pred_phrase['n_conj'] = len(conjunct_ids)+1
        pred_phrase['predicates'] += [sent[i] for i in conjunct_ids]

    span = sent[pred_phrase['start']:pred_phrase['end']+1]
    pred_phrase['span'] = span

    content_ids = [tok.i for tok in span
                   if tok.pos_ != 'AUX' and not tok.dep_.startswith(('nsubj', 'csubj'))]
    if content_ids:
        pred_phrase['span_wo_aux'] = sent[content_ids[0]:content_ids[-1]+1]
    else:
        # the span consists of auxiliaries/subjects only
        pred_phrase['span_wo_aux'] = sent[pred_phrase['predicates'][0].i:pred_phrase['end']+1]

    span_text = span.text
    wo_aux_text = pred_phrase['span_wo_aux'].text
    pred_phrase['phrase'] = span_text[:1].lower()+span_text[1:]
    pred_phrase['phrase_wo_aux'] = wo_aux_text[:1].lower()+wo_aux_text[1:]

    # expanding contractions: (full form, number of characters to cut off)
    phrase = pred_phrase['phrase']
    first_lemma = span[0].lemma_
    expansion = None
    if phrase.startswith("'m"):
        expansion = ('am', 2)
    elif phrase.startswith("'re"):
        expansion = ('are', 3)
    elif phrase.startswith("'s"):
        if first_lemma == 'be':
            expansion = ('is', 2)
        elif first_lemma == 'have':
            expansion = ('has', 2)
    elif phrase.startswith("'d"):
        if first_lemma == 'would':
            expansion = ('would', 2)
        elif first_lemma == 'have':
            expansion = ('had', 2)
    elif phrase.startswith("'ll"):
        # expanded whatever the lemma is, as analytical_verb_q does
        expansion = ('will', 3)

    if expansion is not None:
        full_form, cut = expansion
        # cut exactly the contraction, then drop the whitespace that followed it
        pred_phrase['phrase'] = full_form+' '+phrase[cut:].lstrip()

    # "ca" is what the tokenizer leaves of "can't"
    ca_match = re.search(r"((?<=\s)ca|(?<=^)ca)(?=(\s|$))", pred_phrase['phrase'])
    if ca_match and 'ca' in [p.lower_ for p in pred_phrase['predicates']]:
        pred_phrase['phrase'] = (pred_phrase['phrase'][:ca_match.start()]
                                 + 'can'
                                 + pred_phrase['phrase'][ca_match.end():])

    return pred_phrase

In [9]:
def before_between_after(sent, pred, subj):
    '''Cut the sentence around the predicate and subject phrases.

    ``pred`` and ``subj`` are dicts carrying 'start' and 'end' token indices.
    Returns three strings:
    - before: everything before predicate or subject, whichever starts first
    - in between: everything between the phrase that ends first and the
      start of the other one
    - after: everything after predicate or subject, whichever ends last;
      a leading "n't" is spelled out as "not"
    '''

    everything_before = sent[:min(pred['start'], subj['start'])].text

    # order the two phrases by where they end
    if pred['end'] < subj['end']:
        earlier, later = pred, subj
    else:
        earlier, later = subj, pred

    everything_in_between = sent[earlier['end']+1:later['start']].text
    everything_after = sent[later['end']+1:].text

    contracted_not = ("n't", "n`t", "n’t")
    if everything_after.startswith(contracted_not):
        everything_after = 'not'+everything_after[3:]

    return everything_before, everything_in_between, everything_after

In [10]:
@clean_correction
def general_q_simple_verb(sent, predicate, subject, conj_p):
    '''Simple question, like *Is it necessary? Does John come by often?*

    ``predicate`` is a one-token predicate tuple and ``subject`` its list of
    subject tokens. Returns the sentence rewritten in question word order, or
    an empty string when a strong verb already precedes its subject.
    '''

    correction = ''

    subj = subject_chunk(sent, subject)
    pred = pred_chunk(sent, predicate, conj_p)
    before, between, after = before_between_after(sent, pred, subj)

    verb = predicate[0]
    # verbs that form a question by inversion alone
    # do we include 'need' here?
    strong_verbs = {'be', 'must', 'can', 'could', 'dare',
                    'should', 'shall', 'may', 'might'}

    # strong verbs do not need an auxiliary for question
    # ('have' counts only when it is an auxiliary, which is a coarse POS, not a tag)
    if verb.lemma_ in strong_verbs or (verb.lemma_ == 'have' and verb.pos_ == 'AUX'):
        if pred['start'] > subj['start']:

            correction = ' '.join([before, pred['phrase'], subj['phrase'], between, after])

    # for weak verbs we need to add an auxiliary
    else:
        # morphological features may be missing, hence .get() rather than indexing
        if verb.morph.to_dict().get('Tense') == 'Past':
            aux = 'did'
        elif (subject[0].lower_ in ['i', 'you'] or
              subject[0].pos_ == 'ADJ' or
              subject[0].morph.to_dict().get('Number') == 'Plur' or
              len(subject) > 1 and subj['conj'] != 'or'):
            aux = 'do'
        else:
            aux = 'does'

        # after do/does/did every coordinated predicate goes back to its base form;
        # `_.inflect` is a custom token extension (presumably registered by the
        # lemminflect import at the top of the notebook -- confirm)
        lex_verb = ''
        for word in pred['span']:
            if word in pred['predicates']:
                lex_verb += str(word._.inflect('VB')).lower()+' '
            else:
                lex_verb += word.text+' '

        correction = ' '.join([before, aux, subj['phrase'], between, lex_verb, after])

    return correction

In [11]:
@clean_correction
def subject_q(sent, predicate, subject, conj_p):
    '''Subject question, doesn't need auxiliaries and inversion.

    Returns the sentence with the subject moved in front of the predicate,
    or an empty string when the subject does not start after the predicate.
    A "do" auxiliary is dropped and its tag is transferred to the lexical
    verb(s).
    '''

    correction = ''

    subj = subject_chunk(sent, subject)
    pred = pred_chunk(sent, predicate, conj_p)
    before, between, after = before_between_after(sent, pred, subj)

    if subj['start'] > pred['start']:
        if len(predicate) == 1:

            correction = ' '.join([before, subj['phrase'], between, pred['phrase'], after])

        else:

            aux2 = ''
            aux1 = predicate[0]
            if len(predicate) == 3:
                aux2 = predicate[1].lower_

            if aux1.lemma_ == 'do':
                for verb in pred['predicates']:
                    # keep the verb as it is if no inflected form comes back
                    inflected = verb._.inflect(aux1.tag_) or verb.text
                    # The token text is escaped and the replacement is passed as a
                    # function so that neither is interpreted as regex syntax.
                    pred['phrase_wo_aux'] = re.sub(r'\b'+re.escape(verb.text)+r'\b',
                                                   lambda _match, repl=inflected: repl,
                                                   pred['phrase_wo_aux'], count=1)
                aux1 = ''
            else:
                aux1 = aux1.lower_

            correction = ' '.join([before, subj['phrase'], aux1, between, aux2, pred['phrase_wo_aux'], after])

    return correction

In [12]:
@clean_correction
def analytical_verb_q(sent, predicate, subject, conj_p):
    '''For questions that have an analytical predicate.

    ``predicate`` is ``(aux, verb)`` or ``(aux, aux_2, verb)``. Returns the
    sentence with the first auxiliary moved in front of the subject, or an
    empty string when the subject already stands between the first auxiliary
    and the rest of the predicate.
    '''

    correction = ''

    subj = subject_chunk(sent, subject)
    pred = pred_chunk(sent, predicate, conj_p)
    before, between, after = before_between_after(sent, pred, subj)

    lex_pr = predicate[-1] # lexical verb
    aux = predicate[:-1] # auxiliaries

    a1 = aux[0].i
    # position of whatever follows the first auxiliary inside the predicate
    a2 = aux[1].i if len(aux) == 2 else lex_pr.i
    l = lex_pr.i

    if not a1 < subj['start'] < a2 <= l:
        aux2 = ''
        first_aux = predicate[0]
        if first_aux.lower_ in ["'d", '`d', '’d']:
            if first_aux.lemma_ == 'would':
                aux1 = 'would'
            elif first_aux.lemma_ == 'have':
                aux1 = 'had'
            else:
                # The lemma did not disambiguate the contraction; a string is
                # still required for the join below. Heuristic: 'd followed by
                # a past participle is "had", otherwise "would".
                aux1 = 'had' if predicate[1].tag_ == 'VBN' else 'would'
        elif first_aux.lower_ in ["'ll", '`ll', '’ll']:
            aux1 = 'will'
        elif first_aux.lower_ == 'ca':
            aux1 = 'can'
        else:
            aux1 = first_aux.lower_

        neg = ''
        after_aux = aux[0].i+1
        # the auxiliary may be the last token, in which case nothing follows it
        if after_aux < len(sent) and sent[after_aux].lemma_ == 'not':
            neg = 'not'
            if pred['span_wo_aux'][0].lemma_ == 'not':
                # the negation is re-inserted through `neg`, so cut its
                # three characters off the front of the phrase
                pred['phrase_wo_aux'] = pred['phrase_wo_aux'][3:]

        if len(predicate) == 3:
            aux2 = predicate[1].lower_

        correction = ' '.join([before, aux1, subj['phrase'], between, neg, aux2, pred['phrase_wo_aux'], after])

    return correction

In [13]:
def interrogative_WO(sent, pred_subj, conj_p):
    '''Dispatch a question clause to the matching correction routine.

    ``pred_subj`` is a ``(predicate, subject)`` pair. A subject phrase that
    contains a wh-word is handled as a subject question; otherwise the choice
    depends on whether the predicate is simple (one token) or analytical.
    Returns the corrected sentence, or an empty string.
    '''

    predicate, subject = pred_subj[0], pred_subj[1]
    subject_span = subject_chunk(sent, subject)['span']

    # it's a subject question
    if any(word.tag_[0] == 'W' for word in subject_span):
        return subject_q(sent, predicate, subject, conj_p)

    if len(predicate) == 1:
        return general_q_simple_verb(sent, predicate, subject, conj_p)

    # we have an analytical predicate
    if len(predicate) > 1:
        return analytical_verb_q(sent, predicate, subject, conj_p)

    return ''

In [14]:
@clean_correction
def possible_inversion(sent, pred_subj, conj_p, culprit):
    '''A sentence where the predicate is negated,
       could possibly be an inversion.

    ``culprit`` is the token that triggers inversion. A correction is only
    built when the order is culprit < subject < predicate, i.e. when the
    inversion is missing; otherwise an empty string is returned.
    '''

    correction = ''

    predicate = pred_subj[0]
    pred = pred_chunk(sent, predicate, conj_p)
    subject = pred_subj[1]
    subj = subject_chunk(sent, subject)

    before, between, after = before_between_after(sent, pred, subj)

    if culprit.i < subject[0].i < predicate[0].i:

        if len(predicate) == 1:

            # verbs that invert without an auxiliary
            strong_verbs = {'be', 'must', 'can', 'could', 'dare',
                            'should', 'shall', 'may', 'might'}

            if predicate[0].lemma_ in strong_verbs:
                # verb first, then the subject, then whatever stood between them --
                # the same order the other branches of this function use
                correction = ' '.join([before, pred['phrase'], subj['phrase'], between, after])

            else:
                # morphological features may be missing, hence .get() rather than indexing
                tense = predicate[0].morph.to_dict().get('Tense')
                if tense == 'Past':
                    aux = 'did'
                elif (subject[0].lower_ in ['i', 'you'] or
                      subject[0].pos_ == 'ADJ' or
                      subject[0].morph.to_dict().get('Number') == 'Plur' or
                      len(subject) > 1 and subj['conj'] != 'or'):
                    aux = 'do'
                else:
                    aux = 'does'

                # after do/does/did the coordinated predicates go back to the base form
                lex_verb = ''
                for word in pred['span_wo_aux']:
                    if word in pred['predicates']:
                        lex_verb += str(word._.inflect('VB')).lower()+' '
                    else:
                        lex_verb += word.text+' '

                correction = ' '.join([before, aux, subj['phrase'], between, lex_verb, after])

        else:

            a1 = predicate[0].i
            # position of whatever follows the first auxiliary inside the predicate
            a2 = predicate[1].i if len(predicate) == 3 else predicate[-1].i
            l = predicate[-1].i

            if not a1 < subj['start'] < a2 <= l:
                aux2 = ''
                aux1 = predicate[0].lower_
                lex_verb = pred['phrase_wo_aux']
                if len(predicate) == 3:
                    aux2 = predicate[1].lower_

                correction = ' '.join([before, aux1, subj['phrase'], between, aux2, lex_verb, after])

    return correction

In [15]:
@clean_correction
def standard_word_order(sent, pred_subj, conj_p):
    '''Restore subject-verb (SV) order in a declarative clause.

    Returns the sentence with the subject moved in front of the predicate, or
    an empty string when the subject already comes first or the clause is one
    of the exempted constructions. A "do" auxiliary is dropped and its tag is
    transferred to the lexical verb(s).
    '''

    correction = ''

    predicate = pred_subj[0]
    pred = pred_chunk(sent, predicate, conj_p)
    subject = pred_subj[1]
    subj = subject_chunk(sent, subject)

    before, between, after = before_between_after(sent, pred, subj)

    if not subj['start'] < pred['start'] \
       and predicate[-1].dep_ != 'advcl' \
       and not (pred['span'][0].lemma_ in ['may', 'let'] and len(predicate) > 1):
        # advcl for cases like Were she here, she would support me
        # may and let for jussive cases
        if len(predicate) == 1:
            correction = ' '.join([before, subj['phrase'], between, pred['phrase'], after])

        else:
            aux2 = ''
            aux1 = predicate[0]
            if len(predicate) == 3:
                aux2 = predicate[1].lower_

            if aux1.lemma_ == 'do':
                for verb in pred['predicates']:
                    # keep the verb as it is if no inflected form comes back
                    inflected = verb._.inflect(aux1.tag_) or verb.text
                    # The token text is escaped and the replacement is passed as a
                    # function so that neither is interpreted as regex syntax.
                    pred['phrase_wo_aux'] = re.sub(r'\b'+re.escape(verb.text)+r'\b',
                                                   lambda _match, repl=inflected: repl,
                                                   pred['phrase_wo_aux'], count=1)
                aux1 = ''
            else:
                aux1 = aux1.lower_

            correction = ' '.join([before, subj['phrase'], aux1, between, aux2, pred['phrase_wo_aux'], after])

    return correction

In [16]:
def _reparse(text):
    """Parse ``text`` and extract its predicate/subject structure.

    Returns a ``(doc, ps, conj_p)`` triple: the parsed document, the
    predicate/subject pairs from ``find_pred_subj`` with the subject-less
    pairs filtered out, and the conjunct-predicate mapping.
    """
    doc = nlp(text)
    ps, conj_p = find_pred_subj(doc)
    return doc, [ps_ for ps_ in ps if ps_[1]], conj_p


def _question_targets(doc, ps):
    """Pick the predicate/subject pairs that belong to the question in ``doc``.

    The anchor is the last pair before the final '?' whose lexical verb is
    neither an ``advcl``/``relcl`` nor attached to one. It is followed by the
    pairs whose lexical verb is a conjunct of the anchor's verb and has no
    'but' or 'so' among its children. Returns an empty list when there is no
    anchor.
    """
    q_mark = False
    for token in doc:
        if token.text == '?':
            q_mark = token.i
    ps_before_q = [ps_ for ps_ in ps if ps_[0][-1].i < q_mark]

    skipped = ['advcl', 'relcl']
    anchor = next((ps_ for ps_ in ps_before_q[::-1]
                   if ps_[0][-1].dep_ not in skipped
                   and ps_[0][-1].head.dep_ not in skipped), None)
    if anchor is None:
        return []

    anchor_conjuncts = anchor[0][-1].conjuncts
    return [anchor] + [ps_ for ps_ in ps
                       if ps_[0][-1] in anchor_conjuncts
                       and all(c not in [ch.lower_ for ch in ps_[0][-1].children]
                               for c in ['but', 'so'])]


def main(text):
    """Return ``text`` with its word-order errors corrected, or '' if nothing changes.

    A sentence containing '?' is treated as a question and its question
    clause(s) go through ``interrogative_WO``. Any other sentence is checked
    for missing inversion after negative/restrictive triggers
    (``possible_inversion``) and for wrongly inverted declarative clauses
    (``standard_word_order``). After every successful correction the corrected
    text is parsed again, and the remaining checks run on the new parse.
    """

    doc, ps, conj_p = _reparse(text)
    correction = ''

    if ps:
        q_mark = False
        for token in doc:
            if token.text == '?':
                q_mark = token.i

        if q_mark:
            # there is a question in the sentence
            pred_subjs = _question_targets(doc, ps)
            for i in range(len(pred_subjs)):
                if i <= len(pred_subjs)-1 and pred_subjs[i][1]:
                    # checking i <= is necessary because the length of pred_subjs may change within the cycle
                    new_corr = interrogative_WO(doc, pred_subjs[i], conj_p)
                    if new_corr:
                        correction = new_corr
                        doc, ps, conj_p = _reparse(correction)
                        # the targets must come from the new parse: tokens of
                        # the previous doc never compare equal to new ones
                        pred_subjs = _question_targets(doc, ps)

        else:

            for i in range(len(ps)):
                if i <= len(ps)-1:
                    # checking i <= is necessary because the length of ps may change within the cycle
                    p = ps[i][0][-1]
                    for child in p.lefts:
                        if (child.dep_ == 'neg' or
                                child.lower_ in NON_NEGATIVE_INVERSION):
                            new_corr = possible_inversion(doc, ps[i], conj_p, child)
                            if new_corr:
                                correction = new_corr
                                doc, ps, conj_p = _reparse(correction)
                                # `p` and its children belong to the old parse now
                                break

                if i <= len(ps)-1:
                    # look at the predicate of the current parse
                    p = ps[i][0][-1]
                    if not ('ADV' in [t.pos_ for t in p.lefts]
                            or any(dep in [t.dep_ for t in p.lefts] for dep in ['expl', 'prep'])):
                        # clauses with adverbs first (e.g. here, there and so) can have both inversion and standard word order,
                        # so we don't consider those
                        new_corr = standard_word_order(doc, ps[i], conj_p)
                        if new_corr:
                            correction = new_corr
                            doc, ps, conj_p = _reparse(correction)

            subject_spans = []
            for ps_ in ps:
                if ps_[1]:
                    span = subject_chunk(doc, ps_[1])['span']
                    subject_spans += [word for word in span]
            predicates = [ps_[0][-1].i for ps_ in ps]
            for root, conjs in conj_p.items():
                predicates += conjs

            for w in doc:

                if (w.lower_ in ['nowhere', 'only']
                    and w not in subject_spans
                    and w.head.i not in predicates) \
                   or (w.lower_ == 'no'
                       and not w.is_sent_end
                       and not w.is_sent_start
                       and w.nbor().lower_ in ['way', 'circumstances', 'condition', 'conditions', 'point']
                       and w.nbor(-1).lower_ in ['under', 'in', 'on', 'at']) \
                   or (w.lower_ == 'no'
                       and not w.is_sent_end
                       and w.nbor().lower_ == 'sooner'):
                    # climb the tree from the trigger up to the predicate that governs it
                    pred_to_inspect, w_c = None, w
                    while pred_to_inspect is None:
                        head = w_c.head
                        if head.i in predicates and head.dep_ not in ['advcl', 'relcl']:
                            pred_to_inspect = head
                        else:
                            if w_c == head:
                                pred_to_inspect = w # so that it doesn't loop indefinitely
                            w_c = head
                    if pred_to_inspect.i > w.i:
                        try:
                            ps_to_inspect = next(p for p in ps if p[0][-1] == pred_to_inspect)
                            new_corr = possible_inversion(doc, ps_to_inspect, conj_p, w)
                            if new_corr:
                                correction = new_corr
                                doc, ps, conj_p = _reparse(correction)
                        except StopIteration:
                            # no predicate/subject pair for that predicate
                            pass

    return correction
                

# Words that are not tagged as negation but are still checked as inversion triggers:
# main() passes a left child of a predicate to possible_inversion() when its
# lower-cased form is in this set.
NON_NEGATIVE_INVERSION = {'hardly', 'scarcely', 'barely', 'rarely', 'little', 'seldom'}

Enter the text you want to correct at the prompt below, or paste it directly into the `text` variable.

In [33]:
# Read the text to correct from the input prompt.
text = input()
# Alternative to the prompt: assign the text directly.
# text = ''

Let us discuss pluses and minuses about it and what should government do.


In [34]:
# Split the input into sentences. This cell rebinds `text` from a string to a
# list of sentences; the guard makes it safe to run again, when `text` is
# already that list.
if isinstance(text, str):
    text = sent_tokenize(text)

In [35]:
# Correct sentence by sentence; a sentence for which main() returns nothing is kept unchanged.
final = ''
for sent in text:
    # turn curly single quotes into a straight apostrophe before parsing
    sent = re.sub("’|‘", "'", sent)
    corr = main(sent)
    final += ' '+(corr if corr else sent)

In [36]:
# Remove the space that the loop above put in front of the first sentence, then display the result.
final = final.strip()
final

'Let us discuss pluses and minuses about it and what government should do.'

Or if you want to see the difference highlighted:

In [37]:
from difflib import Differ
diff = Differ()

# Print a line-level diff for every sentence that main() changes.
for sent in text:
    sent = re.sub("’|‘", "'", sent)
    corr = main(sent)
    if not corr:
        continue
    print(*diff.compare([sent], [corr]), sep='\n')

- Let us discuss pluses and minuses about it and what should government do.
?                                                     -------

+ Let us discuss pluses and minuses about it and what government should do.
?                                                                +++++++

