In [1]:
import random
import bz2
import spacy
nlp = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])

# Data

In [None]:
# Data-related links:
# https://newsela.com/read/elem-country-haiti/id/42605/
# https://github.com/shashiongithub/Split-and-Rephrase

In [2]:
# Build the lookup {complex sentence: reference simplification}.
# Every usable line of the corpus holds exactly two tab-separated fields;
# the first one is used as the key (the complex sentence), the second as
# the value (the simple text).
compl_to_sim = {}
# `with` guarantees the compressed file is closed even if reading fails.
with bz2.BZ2File("tokenized_geougraphy.txt.bz2", "r") as source_file:
    for line in source_file:
        texts = line.split('\t')

        # Skip malformed lines (missing or extra tab-separated fields).
        if len(texts) != 2:
            continue
        try:
            # Validation only, the encoded values are not kept.  This
            # notebook targets Python 2, where calling .encode() on a byte
            # string first decodes it as ASCII, so lines with non-ASCII
            # bytes raise here and are skipped.
            texts[0].encode('utf-8')
            texts[1].encode('utf-8')
        except UnicodeError:
            continue
        compl_to_sim[texts[0]] = texts[1]


In [3]:
# Number of (complex, simple) pairs kept in the lookup built above.
len(compl_to_sim)

8381

In [4]:
# Draw one (complex, simple) pair at random and show both sides.
compl, sim = random.choice(list(compl_to_sim.items()))

print('complex sentence:')
print(compl)
print('\nsimple text given:')
print(sim)

complex sentence:
Alpharetta , FultonCounty , Georgia is located within the UnitedStates .

simple text given:
Alpharetta , is in Georgia , in the UnitedStates . Alpharetta is part of FultonCounty in Georgia .



# Rules

In [5]:
# Some words do not make sense (or change their meaning) when used without their dependent clause.
# For example, "part" in "part of the U.S.A." or "length" in "length of 4000".
# This function builds the clause required to preserve such information.

def getReasonableChunk(word, doc, _active=None):
    """Collect the tokens `word` needs around it to keep its meaning.

    The returned list is NOT sorted by position and may contain tokens in
    "head first" order; callers that need text order sort by ``token.i``.

    What is collected depends on the part of speech of `word`:

    * PROPN: the noun chunk of `doc` rooted at `word` (or just `word` when
      there is none); if `word` has a ``cc`` child, that conjunction plus the
      chunks of its ``conj`` proper-noun children.
    * NOUN: the noun itself, its ``nummod`` and ``det`` children, the chunks
      of its ``compound`` children and, when ``random.random() > 0.5``, the
      subtrees of its ``amod`` children tagged ADJ or VERB.
    * "which" / "that" / "who": the chunk of the token returned by
      ``replaceWhich`` (the pronoun itself when nothing is found).
    * anything else: the token itself.

    In every case each ``prep`` child that governs at least one ``pobj`` is
    appended, followed by the chunks of those objects.

    Args:
        word: token to expand (needs ``pos_``, ``dep_``, ``text``, ``i``,
            ``children`` and, for modifiers, ``subtree``).
        doc: parsed document the token belongs to (needs ``noun_chunks``).
        _active: internal. Indices of the tokens currently being expanded
            higher up the recursion; external callers should leave it unset.

    Returns:
        list of tokens.
    """
    if _active is None:
        _active = set()
    # Remember whether this token is already on the recursion stack so that
    # only the outermost call removes it again.
    already_active = word.i in _active
    _active.add(word.i)

    chunk_words = []
    if word.pos_ == 'PROPN':
        try:
            chunks = [x for x in doc.noun_chunks if x.root == word][0]
        except Exception:
            # Best-effort fallback (typically IndexError: no noun chunk is
            # rooted at this token), keep just the token itself.
            chunks = [word]
        chunk_words.extend(chunks)

        inner_conj = [x for x in word.children if x.dep_ == 'cc']
        if inner_conj:
            chunk_words.extend(inner_conj)
            for conj_propn in [x for x in word.children
                               if x.dep_ == 'conj' and x.pos_ == 'PROPN']:
                chunk_words.extend(getReasonableChunk(conj_propn, doc, _active))

    elif word.pos_ == 'NOUN':
        chunk_words.append(word)
        chunk_words.extend(x for x in word.children if x.dep_ == 'nummod')
        chunk_words.extend(x for x in word.children if x.dep_ == 'det')

        for compound in [x for x in word.children if x.dep_ == 'compound']:
            chunk_words.extend(getReasonableChunk(compound, doc, _active))

        optional_mods = [x for x in word.children
                         if x.dep_ == 'amod' and x.pos_ in ['ADJ', 'VERB']]
        # Adjectival modifiers are kept only about half of the time; the
        # random draw happens for every noun, with or without modifiers.
        if random.random() > 0.5:
            for mod in optional_mods:
                chunk_words.extend(mod.subtree)

    elif word.text.lower() in ['which', 'that', 'who']:
        # NOTE(review): "who" is resolved with replaceWhich, exactly as the
        # original code did; replaceWho exists in this notebook but is not
        # called from here - confirm which one is intended.
        replacer = replaceWhich(word, doc)
        if replacer is None or already_active or replacer.i in _active:
            # Either nothing to substitute, or substituting would jump back
            # to a token that is still being expanded (e.g. "parts of
            # which" resolves to "parts"), which would recurse forever.
            chunk_words.append(word)
        else:
            chunk_words.extend(getReasonableChunk(replacer, doc, _active))
    else:
        chunk_words.append(word)

    # Prepositional phrases attached to the word: keep "<prep> <object chunk>"
    # whenever the preposition actually governs an object.
    for prep in [x for x in word.children if x.dep_ == 'prep']:
        pobjects = [x for x in prep.children if x.dep_ == 'pobj']
        if pobjects:
            chunk_words.append(prep)
            for pobj in pobjects:
                chunk_words.extend(getReasonableChunk(pobj, doc, _active))

    if not already_active:
        _active.discard(word.i)
    return chunk_words

In [6]:
# convert list of lists of words to text (form readable sentences)
def wordsToSimpleText(sentence_words):
    """Render lists of tokens as readable, space-tokenised sentences.

    Each inner list becomes one sentence: the token texts are joined with
    single spaces, the first character is upper-cased and " . " terminates
    the sentence.  Empty inner lists are skipped.

    Relative words that no longer have an antecedent inside the simple
    sentence are then replaced: "which" -> "this", "where" -> "here" and
    "What 's" -> "It 's".

    Args:
        sentence_words: iterable of token lists; tokens need a ``text``
            attribute.

    Returns:
        The concatenated sentences as one string ('' for no sentences).
    """
    sentences = []
    for words in sentence_words:
        # Every token is followed by one space, so the sentence ends with
        # "<last token> " and the final form is "<tokens> . ".
        sentence_text = ''.join(word.text + ' ' for word in words)
        if sentence_text:
            sentences.append(sentence_text[0].upper() + sentence_text[1:] + '. ')
    text = ''.join(sentences)

    text = text.replace(" which ", " this ")
    text = text.replace("Which ", "This ")
    # Match the whole token only: the former pattern " where" (no trailing
    # space) also rewrote words such as "whereas" into "hereas".
    text = text.replace(" where ", " here ")
    text = text.replace("Where ", "Here ")
    text = text.replace("What 's", "It 's")
    return text

In [7]:
# Rule 1
# build set of simple sentences based on each verb's subtree in a complex sentence
# input: spacy.doc with complex sentences
# output: text with one or more simple sentences

def subjVerbObjToSentence(sentence, verb_simple_texts):
    """Rule 1: build one simple sentence around every verb of `sentence`.

    For each token tagged VERB the following are gathered and then put back
    into text order (sorted by ``token.i``):

    * the verb, its ``aux`` / ``auxpass`` children and an ``advmod`` "where";
    * the chunks (see ``getReasonableChunk``) of its subjects, ``attr``,
      ``neg`` and ``dobj`` / ``oprd`` / ``acomp`` children;
    * every ``prep`` and ``agent`` child that governs a ``pobj``, together
      with the chunks of those objects;
    * for a ``conj`` verb without its own subject: the subject and ``aux``
      children of the verb it is conjoined to;
    * for an ``xcomp`` / ``advcl`` verb: the ancestors up to (and including)
      the nearest VERB, at most five levels up, plus the ``dobj`` children
      of the last ancestor reached.

    A sentence is emitted only if at least one meaningful part was found
    (negations, auxiliaries and "where" alone do not count).  Sentences
    built around an ``acl`` verb are prefixed with "It is ".

    Args:
        sentence: iterable of parsed tokens (a spaCy doc).
        verb_simple_texts: list the generated sentence texts are appended to
            (modified in place).

    Returns:
        The same `verb_simple_texts` list.
    """
    # (dependency labels, whether finding one makes the sentence meaningful)
    argument_groups = (
        (['nsubj', 'nsubjpass'], True),
        (['attr'], True),
        (['neg'], False),
        (['dobj', 'oprd', 'acomp'], True),
    )

    for v in [x for x in sentence if x.pos_ == 'VERB']:
        words = [v]
        has_sense = False

        words.extend(x for x in v.children if x.dep_ in ['aux', 'auxpass'])
        words.extend(x for x in v.children
                     if x.dep_ == 'advmod' and x.text in ['where'])

        for labels, meaningful in argument_groups:
            found = [x for x in v.children if x.dep_ in labels]
            if found and meaningful:
                has_sense = True
            for token in found:
                words.extend(getReasonableChunk(token, sentence))

        # Prepositional phrases first, then passive agents ("by ...").
        for label in ('prep', 'agent'):
            for prep in [x for x in v.children if x.dep_ == label]:
                pobjects = [x for x in prep.children if x.dep_ == 'pobj']
                if pobjects:
                    has_sense = True
                    words.append(prep)
                    for pobj in pobjects:
                        words.extend(getReasonableChunk(pobj, sentence))

        # A conjoined verb without its own subject borrows the subject (and
        # auxiliaries) of the verb it is attached to.
        own_subjects = [x for x in v.children if x.dep_ in ['nsubj', 'nsubjpass']]
        if v.dep_ == 'conj' and not own_subjects:
            inherited = [x for x in v.head.children
                         if x.dep_ in ['nsubj', 'nsubjpass', 'aux']]
            if inherited:
                has_sense = True
                for token in inherited:
                    words.extend(getReasonableChunk(token, sentence))

        if v.dep_ in ['xcomp', 'advcl']:
            h = v
            for _ in range(5):
                # The root is its own head: stop instead of appending the
                # same token again on every remaining step.
                if h.head.i == h.i:
                    break
                h = h.head
                words.append(h)
                if h.pos_ == 'VERB':
                    has_sense = True
                    break
            words.extend(x for x in h.children if x.dep_ == 'dobj')

        words = sorted(words, key=lambda x: x.i)

        if has_sense:
            simple_text = wordsToSimpleText([words])
            if v.dep_ == 'acl':
                simple_text = "It is " + simple_text
            verb_simple_texts.append(simple_text)
    return verb_simple_texts


In [8]:
# Rule 2
# Nouns together with some related words sometimes carry important
# information that the previous rule skips to keep sentences simple.
# This function forms separate sentences for such nouns.
def adjNounToSentence(sentence, noun_simple_texts):
    """Rule 2: turn information attached to common nouns into sentences.

    For every token tagged NOUN up to three kinds of sentence are generated,
    using "are" for plural nouns (tag ``NNS``) and "is" otherwise:

    * "<noun> is <modifiers> . " from its ``amod`` children tagged ADJ or
      VERB (except "many", "more", "other", "main"); emitted only when
      ``random.random() > 0.5``;
    * "<noun> is <apposition>. " for each ``appos`` child;
    * "<noun> is in|at|on <object> . " for each ``pobj`` of an "in", "at"
      or "on" ``prep`` child.

    The result is keyed by the noun's text, so a later sentence for the same
    noun text replaces an earlier one.

    Args:
        sentence: iterable of parsed tokens (a spaCy doc).
        noun_simple_texts: dict {noun text: sentence}, modified in place.

    Returns:
        The same `noun_simple_texts` dict.
    """
    excluded_mods = ['many', 'more', 'other', 'main']

    for n in [x for x in sentence if x.pos_ == 'NOUN']:
        verb = "are" if n.tag_ == 'NNS' else "is"

        # --- adjectival modifiers -------------------------------------
        mod_words = []
        for mod in n.children:
            if (mod.dep_ == 'amod' and mod.pos_ in ['ADJ', 'VERB']
                    and mod.text.lower() not in excluded_mods):
                mod_words.extend(mod.subtree)

        if mod_words:
            mod_words = sorted(mod_words, key=lambda x: x.i)
            # Only drawn when there is something to say; the sentence is
            # kept about half of the time.
            if random.random() > 0.5:
                sent_text = ' '.join([n.text, verb] + [x.text for x in mod_words])
                sent_text = sent_text[0].upper() + sent_text[1:]
                sent_text += ' . '
                noun_simple_texts[n.text] = sent_text

        # --- appositions ------------------------------------------------
        for ap in [x for x in n.children if x.dep_ == 'appos']:
            ap_words = sorted(ap.subtree, key=lambda x: x.i)
            sent_text = n.text + ' ' + verb + ' ' \
                + ' '.join([x.text for x in ap_words]) + '. '
            sent_text = sent_text[0].upper() + sent_text[1:]
            noun_simple_texts[n.text] = sent_text

        # --- location-like prepositional phrases ------------------------
        for prep in [x for x in n.children
                     if x.dep_ == 'prep' and x.text in ['in', 'at', 'on']]:
            for pobj in [x for x in prep.children if x.dep_ == 'pobj']:
                # getReasonableChunk returns tokens head-first (e.g. noun
                # before its determiner); restore text order before joining.
                obj_words = sorted(getReasonableChunk(pobj, sentence),
                                   key=lambda x: x.i)
                sent_text = n.text + ' ' + verb + ' ' + prep.text + ' ' \
                    + ' '.join([x.text for x in obj_words]) + ' . '
                sent_text = sent_text[0].upper() + sent_text[1:]
                noun_simple_texts[n.text] = sent_text

    return noun_simple_texts

In [9]:
# Rule 3
# Geography texts often contain constructions like "City, Region, Country",
# which state that the first entity belongs to (is located inside) the second,
# and the second belongs to the third.
# This function forms full sentences to describe such facts.
# A rule similar to Rule 2 is also applied to proper nouns.

def propnConjToSentence(sentence, noun_simple_texts):
    """Rule 3: describe appositions and "A, B, C" containment of proper nouns.

    For each PROPN token:
      * every ``appos`` child yields "<propn> is <apposition subtree>. ";
      * unless the token has a ``cc`` child, every PROPN child attached as
        ``conj`` or ``appos`` yields "<propn> is in <child> . " or
        "<propn> is located in <child> . " (picked at random).

    Sentences are stored under the proper noun's text, so the last one
    generated for a given text wins.  The dict is updated in place and
    returned.
    """
    link_phrases = [' is in ', ' is located in ']

    for token in sentence:
        if token.pos_ != 'PROPN':
            continue
        propn = token

        for child in propn.children:
            if child.dep_ != 'appos':
                continue
            in_order = sorted([t for t in child.subtree], key=lambda t: t.i)
            described = ' '.join([t.text for t in in_order])
            sentence_text = propn.text + ' is ' + described + '. '
            noun_simple_texts[propn.text] = sentence_text[0].upper() + sentence_text[1:]

        # Coordinated names ("A and B") are not containment chains.
        if any(child.dep_ == 'cc' for child in propn.children):
            continue

        containers = [child for child in propn.children
                      if child.dep_ in ['conj', 'appos'] and child.pos_ == 'PROPN']
        for container in containers:
            noun_simple_texts[propn.text] = (
                propn.text + random.choice(link_phrases) + container.text + ' . ')

    return noun_simple_texts

In [10]:
# Rule 4
# Coreference resolution for which, that
def replaceWhich(token, doc):
    """Find the noun a relative "which" / "that" refers to.

    Walks up the dependency tree from `token` and returns the first ancestor
    tagged NOUN or PROPN.  The walk stops at the sentence root or after 50
    steps, whichever comes first; `token` itself is never returned.

    Args:
        token: parsed token to start from (needs ``dep_``, ``head``, and
            ``pos_`` on its ancestors).
        doc: unused; kept so existing callers keep working.

    Returns:
        The ancestor token, or None when there is no such ancestor.
    """
    for _ in range(50):
        # spaCy labels the root dependency "ROOT" (upper case); the former
        # lower-case comparison never matched, so the loop kept re-visiting
        # the root until the step limit.  The lower-case form is still
        # accepted for non-spaCy inputs.
        if token.dep_ in ('ROOT', 'root'):
            break
        token = token.head
        if token.pos_ in ['NOUN', 'PROPN']:
            return token

    return None

In [11]:
# Rule 5
# Coreference resolution for Who
def replaceWho(token, doc):
    """Find the proper noun a relative "who" refers to.

    Walks up the dependency tree from `token` and returns the first ancestor
    tagged PROPN.  The walk stops at the sentence root or after 50 steps,
    whichever comes first; `token` itself is never returned.

    Args:
        token: parsed token to start from (needs ``dep_``, ``head``, and
            ``pos_`` on its ancestors).
        doc: unused; kept so existing callers keep working.

    Returns:
        The ancestor token, or None when there is no such ancestor.
    """
    for _ in range(50):
        # spaCy labels the root dependency "ROOT" (upper case); the former
        # lower-case comparison never matched, so the loop kept re-visiting
        # the root until the step limit.  The lower-case form is still
        # accepted for non-spaCy inputs.
        if token.dep_ in ('ROOT', 'root'):
            break
        token = token.head
        if token.pos_ in ['PROPN']:
            return token

    return None

# Main

In [12]:
# put rules together
def simplifySentence(compl):
    """Apply all rules to one complex sentence and return the simple text.

    The sentence is parsed with the module-level ``nlp`` pipeline (NER and
    text categorisation disabled).  Rule 1 produces one sentence per verb;
    Rules 2 and 3 produce extra sentences keyed by noun text.  The result is
    the verb sentences in order, each optionally followed by ONE noun
    sentence: the first whose noun text occurs in the verb sentence
    (case-insensitive) and whose content (with " is " collapsed to a space
    and periods removed) is not already part of it.  Every noun sentence is
    used at most once.

    Args:
        compl: the complex sentence, either UTF-8 encoded bytes (as stored
            in the corpus lookup) or already-decoded text.

    Returns:
        The generated simple text ('' when no verb sentence was produced).
    """
    # Decode only real byte strings.  Calling .decode() on text that is
    # already unicode fails for non-ASCII characters on Python 2 and does
    # not exist at all on Python 3.
    if isinstance(compl, bytes):
        compl = compl.decode('utf-8')
    doc = nlp(compl, disable=['ner', 'textcat'])

    verb_s_texts = subjVerbObjToSentence(doc, [])
    noun_s_texts = adjNounToSentence(doc, {})
    noun_s_texts = propnConjToSentence(doc, noun_s_texts)

    simple_text = ''
    for sent in verb_s_texts:
        simple_text += sent
        st = sent.lower()
        for n in list(noun_s_texts):
            check_sent = noun_s_texts[n].lower().replace(" is ", " ")
            check_sent = check_sent.replace(".", "")
            if (st.find(n.lower()) >= 0) and (st.find(check_sent) < 0):
                simple_text += noun_s_texts[n]
                # Blank the entry so it is not emitted again: an empty
                # check string is found in every sentence.
                noun_s_texts[n] = ''
                break
    return simple_text

In [60]:
# Size of the (complex, simple) lookup, displayed again before sampling.
len(compl_to_sim)

8381

In [49]:
# Draw another random (complex, simple) pair and show both sides.
compl, sim = random.choice(list(compl_to_sim.items()))

print('complex sentence:')
print(compl)
print('\nsimple text given:')
print(sim)

complex sentence:
In order to bring some order to all of these geographical variations , and to provide a constant point of reference , a datum or base level was established based on averaging out the elevation of sea level from many tide gauges over an extended period of time .

simple text given:
To make things simpler , a standard sea level was made . It was based on averages around the world .



In [39]:
# Pick a random corpus item, run the rule pipeline on its complex sentence
# and show the input next to the generated text.
compl, sim = random.choice(list(compl_to_sim.items()))
simple_text = simplifySentence(compl)

print('complex sentence:')
print(compl)
print('\nsimple text generated:')
print(simple_text)

complex sentence:
300 NorthLaSalle is located in Chicago , Illinois , where RahmEmanuel is a leader .

simple text generated:
300 NorthLaSalle is located in Chicago . Chicago is in Illinois . Here RahmEmanuel is a leader . 


In [29]:
# A literal sentence that the next cell passes to simplifySentence.
other_sent = u'Records of his life, apart from later records of creditors, end after an outbreak of plague in the city in 1451. '

In [30]:
# Run the rule pipeline on the literal sentence and show input and output.
simple_text = simplifySentence(other_sent)

print('complex sentence:')
print(other_sent)
print('\nsimple text generated: ')
print(simple_text)

complex sentence:
Records of his life, apart from later records of creditors, end after an outbreak of plague in the city in 1451. 

simple text generated: 
Records of life end after an outbreak of plague in the city in 1451 . Records are later . 
