In [289]:
import random
import json
import re
import os
import bz2
import spacy
nlp = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])
import gensim

# from textstat.textstat import textstat
# import matplotlib.pyplot as plt

# from nlgeval import compute_metrics
# import kenlm

# Functions to read and prepare data

In [3]:
# extract complex and simple sentences and text category 
# from all data for one sentence in DBPedia dataset

def parseSentenceData(data):
    """Parse the text of one COMPLEX record from the complex/simple dataset.

    The record is split on blank lines. The first item holds the
    ``COMPLEX-<id>`` header followed by the complex sentence. The remaining
    items are either ``COMPLEX-<id>:MR-<n>`` entries (whose second line may be
    a ``category=... eid=Id... size=...`` descriptor) or
    ``COMPLEX-<id>:MR-<n>:SIMPLE-<k>`` entries whose remaining lines are the
    simple sentences.

    Returns a tuple ``(complexsent, simpsents, cat)``:
      * complexsent -- the complex sentence, stripped;
      * simpsents   -- dict mapping each distinct simplification (its lines
                       joined with single spaces) to 1;
      * cat         -- category of the last MR entry carrying a descriptor
                       line, or '' when there is none.
    """
    items = data.strip().split("\n\n")

    header_lines = items[0].strip().split("\n")
    complexid = int(header_lines[0].split("-")[1].strip())
    complexsent = header_lines[1].strip()

    # An MR header is followed directly by a newline, a SIMPLE header continues
    # with ':SIMPLE-', so an item can match at most one of the two patterns.
    mr_header = re.compile('COMPLEX-'+str(complexid)+':MR-[0-9]*\n')
    simple_header = re.compile('COMPLEX-'+str(complexid)+':MR-[0-9]*:SIMPLE-[0-9]*\n')
    descriptor = re.compile('category=[a-zA-Z]* eid=Id[0-9]* size=[0-9]*')

    cat = ''
    simpsents = {}
    for item in items[1:]:
        item_lines = item.strip().split("\n")
        if mr_header.match(item):
            # Only the category is needed from an MR entry. A header-only
            # entry (no second line) is tolerated instead of raising.
            if len(item_lines) > 1 and descriptor.match(item_lines[1]):
                cat = item_lines[1].strip().split(' ')[0].split('=')[1]
        elif simple_header.match(item):
            # Simplifications are collected regardless of whether their MR
            # header was present in the record.
            sents = " ".join(item_lines[1:]).strip()
            simpsents[sents] = 1

    return complexsent, simpsents, cat

In [4]:
# remove some frequent garbage from sentences 
def preprocessSentence(text):
    """Return `text` with frequent markup garbage removed.

    Deleted, in this order: ``-LRB- ... -RRB-`` spans (non-greedy), leftover
    ``-RRB-`` markers, one leading and one trailing double quote, newlines,
    carriage returns, tabs, backticks and pairs of single quotes.
    """
    # The order is significant: bracketed spans must be removed before the
    # stray closing markers, otherwise the span pattern could no longer match.
    garbage_patterns = (
        r'-LRB-(.*?)-RRB-',
        r'-RRB-',
        r'^"',
        r'"$',
        r'\n',
        r'\r',
        r'\t',
        r'\`',
        "''",
    )
    for pattern in garbage_patterns:
        text = re.sub(pattern, '', text)
    return text

In [5]:
# Read DBPedia dataset
def readDBPediaData(compl_to_sim, filename='../dataset/complexsimple.txt.bz2'):
    """Add geography-like complex -> simple pairs from the bz2 dataset file.

    The file is read line by line; a line matching ``COMPLEX-<digits>`` starts
    a new record. Each record is parsed with parseSentenceData and kept only
    if its category is one of Monument, City, Airport or Building and it has
    a non-empty simplification. Both sentences go through preprocessSentence
    before being stored.

    compl_to_sim -- dict that is updated in place and also returned.
    filename     -- path of the bz2 file (defaults to the original location).
    """
    close_to_geo_categories = ['Monument', 'City', 'Airport', 'Building']

    def _storeRecord(record_lines):
        # Parse one buffered record and store it when it passes the filters.
        complexsent, simpsents, cat = parseSentenceData("".join(record_lines))
        # A record may come without any simplification; fall back to ''
        # so it is filtered out below instead of raising.
        simple_sen = next(iter(simpsents), '')
        if (cat in close_to_geo_categories) and (len(simple_sen) > 0):
            compl_to_sim[preprocessSentence(complexsent)] = preprocessSentence(simple_sen)

    sentdata = []
    with bz2.BZ2File(filename, "r") as content:
        for line in content:
            if sentdata and re.match('COMPLEX-[0-9]*\n', line):
                # A new header closes the record collected so far.
                _storeRecord(sentdata)
                sentdata = [line]
            else:
                sentdata.append(line)

    # Records are only flushed when the next header shows up, so the final
    # record of the file has to be flushed explicitly.
    if sentdata:
        _storeRecord(sentdata)
    return compl_to_sim

In [6]:
# read Newsela dataset
def readNewselaData(compl_to_sim):
    """Add complex -> simple pairs from the bz2-compressed Newsela file.

    Every line is split on tabs. A line is used only when it has exactly
    three fields and both of the first two fields are at least 10 characters
    long; field 0 becomes the key and field 1 the value, each cleaned with
    preprocessSentence. The dict is updated in place and returned.
    """
    min_chars = 10
    with bz2.BZ2File('../dataset/clear_newsela.txt.bz2', "r") as content:
        for line in content:
            fields = line.split('\t')
            # The length test short-circuits, so fields[0]/fields[1] are only
            # touched on well-formed lines.
            usable = (len(fields) == 3
                      and len(fields[0]) >= min_chars
                      and len(fields[1]) >= min_chars)
            if usable:
                compl_to_sim[preprocessSentence(fields[0])] = preprocessSentence(fields[1])
    return compl_to_sim

In [51]:
# Dump the tokenized pairs, one "complex<TAB>simple" pair per line.
with open("../dataset/tokenized_geougraphy.txt", "w") as f:
    for com, sim in tokenized_texts.items():
        f.write('\t'.join((com, sim)) + '\n')

In [6]:
# split data to train and test set
def splitTrainTest(compl_to_sim, train_fraction=0.8):
    """Randomly split the pairs of `compl_to_sim` into a train and a test dict.

    Pairs whose key or value cannot be encoded as UTF-8 are skipped. Every
    remaining pair draws one number from random.random() and goes to the
    train dict when it is below `train_fraction`, otherwise to the test dict.

    compl_to_sim   -- dict mapping complex text to simple text.
    train_fraction -- share of pairs expected in the train dict (default 0.8).

    Returns (train, test).
    """
    train = {}
    test = {}

    for compl, sim in compl_to_sim.items():
        try:
            # The encoded values are not needed; encoding is only a probe
            # for text that cannot be represented as UTF-8.
            sim.encode('utf-8')
            compl.encode('utf-8')
        except UnicodeError:
            continue
        if random.random() < train_fraction:
            train[compl] = sim
        else:
            test[compl] = sim
    return train, test

# Functions for baseline logic

In [308]:
# Some words do not make sense (or change their meaning) when used without their dependent clause.
# For example, "part" in "part of the U.S.A." or "length" in "length of 4000".
# This function builds the chunk of words required to preserve such information.

def getReasonableChunk(word, doc):
    """Collect the tokens that should accompany `word` in a simple sentence.

    word -- token to expand.
    doc  -- parsed document; its noun_chunks are searched for proper nouns
            and it is passed on to replaceWhich and to recursive calls.

    Returns a list of tokens in collection order (callers sort them):
      * PROPN: the noun chunk rooted at the word (or the word alone when no
        chunk is rooted there); if the word has a 'cc' child, those children
        plus the chunks of its PROPN 'conj' children;
      * NOUN: the word, its 'nummod' and 'det' children and the chunks of
        its 'compound' children;
      * which / that / who: the chunk of the antecedent found by
        replaceWhich, or the word itself when none is found;
      * anything else: the word itself.
    In every case each 'prep' child that has 'pobj' children is appended,
    followed by the chunks of those objects.
    """
    chunk_words = []
    if word.pos_ == 'PROPN':
        # First noun chunk whose root is this word, if any.
        rooted_chunk = next((c for c in doc.noun_chunks if c.root == word), None)
        if rooted_chunk is None:
            chunk_words.append(word)
        else:
            chunk_words.extend(rooted_chunk)

        coordinators = [x for x in word.children if x.dep_ == 'cc']
        if coordinators:
            chunk_words.extend(coordinators)
            for conj_propn in [x for x in word.children
                               if (x.dep_ == 'conj') and (x.pos_ == 'PROPN')]:
                chunk_words.extend(getReasonableChunk(conj_propn, doc))

    elif word.pos_ == 'NOUN':
        chunk_words.append(word)
        chunk_words.extend([x for x in word.children if x.dep_ == 'nummod'])
        chunk_words.extend([x for x in word.children if x.dep_ == 'det'])
        # Compounds are expanded recursively rather than added as bare tokens.
        for compound in [x for x in word.children if x.dep_ == 'compound']:
            chunk_words.extend(getReasonableChunk(compound, doc))

    elif word.text.lower() in ['which', 'that', 'who']:
        # NOTE(review): 'who' is resolved with replaceWhich as well, so a
        # common-noun antecedent is accepted; replaceWho (PROPN only) is
        # defined in this notebook but not called here - confirm the intent.
        replacer = replaceWhich(word, doc)
        if replacer is not None:
            chunk_words.extend(getReasonableChunk(replacer, doc))
        else:
            chunk_words.append(word)
    else:
        chunk_words.append(word)

    # Prepositional phrases hanging off the word ("part OF the U.S.A.").
    for prep in [x for x in word.children if x.dep_ == 'prep']:
        pobjects = [x for x in prep.children if x.dep_ == 'pobj']
        if pobjects:
            chunk_words.append(prep)
            for pobj in pobjects:
                chunk_words.extend(getReasonableChunk(pobj, doc))

    return chunk_words

In [314]:
# convert list of lists of words to text (form readable sentences)
def wordsToSimpleText(sentence_words):
    """Render lists of words as readable text.

    sentence_words -- list of sentences, each a list of objects with `.text`.

    Each non-empty sentence becomes its words followed by single spaces, with
    the first character upper-cased and '. ' appended. Afterwards " which "
    is replaced by " this " and "Which " by "This " in the joined text.
    """
    rendered = []
    for words in sentence_words:
        body = ''.join(word.text + ' ' for word in words)
        if not body:
            # A sentence without words contributes nothing.
            continue
        rendered.append(body[0].upper() + body[1:] + '. ')

    text = ''.join(rendered)
    return text.replace(" which ", " this ").replace("Which ", "This ")

In [369]:
# main baseline function
# build set of simple sentences based on each verb's subtree in a complex sentence
# input: spacy.doc with complex sentences
# output: text with one or more simple sentences

def _traceSentenceWords(label, words):
    """Print one debug line: the stage label followed by the words so far."""
    # A single pre-formatted argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('%s %s' % (label, words))


def _extendWithChunks(tokens, new_sent_words, sentence):
    """Append the reasonable chunk of every token; return True if any token was given."""
    for tok in tokens:
        new_sent_words.extend(getReasonableChunk(tok, sentence))
    return len(tokens) > 0


def _extendWithPrepObjects(verb, dep_label, new_sent_words, sentence):
    """Append each `dep_label` child of `verb` that has 'pobj' children, plus the
    chunks of those objects; return True if at least one such child was found."""
    found = False
    for prep in [x for x in verb.children if x.dep_ == dep_label]:
        pobjects = [x for x in prep.children if x.dep_ == 'pobj']
        if pobjects:
            found = True
            new_sent_words.append(prep)
            _extendWithChunks(pobjects, new_sent_words, sentence)
    return found


def subjVerbObjToSentence(sentence, verb_simple_texts):
    """Build one simple sentence per verb of a parsed complex sentence.

    sentence          -- parsed spaCy doc (iterable of tokens).
    verb_simple_texts -- list that receives the generated texts; it is
                         extended in place and returned.

    For every VERB token the verb, its auxiliaries, subjects, attributes,
    negations, objects, prepositional and agent phrases are collected (each
    expanded with getReasonableChunk). A 'conj' verb without its own subject
    borrows subject/aux tokens from its head; an 'xcomp'/'advcl' verb adds
    up to five ancestors, stopping at the first VERB. The words are sorted
    by token index and rendered with wordsToSimpleText; verbs with dep 'acl'
    get the prefix "It is ". A sentence is emitted only when at least one
    meaningful part was found. A debug line is printed after every stage.
    """
    for v in [x for x in sentence if x.pos_ == 'VERB']:
        new_sent_words = [v]
        has_sense = False

        new_sent_words.extend([x for x in v.children if x.dep_ in ('aux', 'auxpass')])
        _traceSentenceWords('ouxilaries', new_sent_words)

        subjects = [x for x in v.children if x.dep_ in ('nsubj', 'nsubjpass')]
        if _extendWithChunks(subjects, new_sent_words, sentence):
            has_sense = True
        _traceSentenceWords('subjects', new_sent_words)

        attributes = [x for x in v.children if x.dep_ == 'attr']
        if _extendWithChunks(attributes, new_sent_words, sentence):
            has_sense = True
        _traceSentenceWords('attributes', new_sent_words)

        # A negation alone does not make the sentence meaningful.
        negs = [x for x in v.children if x.dep_ == 'neg']
        _extendWithChunks(negs, new_sent_words, sentence)
        _traceSentenceWords('neg', new_sent_words)

        dobjects = [x for x in v.children if x.dep_ in ['dobj', 'oprd', 'acomp']]
        if _extendWithChunks(dobjects, new_sent_words, sentence):
            has_sense = True
        _traceSentenceWords('dobjects', new_sent_words)

        if _extendWithPrepObjects(v, 'prep', new_sent_words, sentence):
            has_sense = True
        _traceSentenceWords('pobjects', new_sent_words)

        if _extendWithPrepObjects(v, 'agent', new_sent_words, sentence):
            has_sense = True
        _traceSentenceWords('pobjects', new_sent_words)

        # A coordinated verb without its own subject borrows the subject
        # (and auxiliaries) of the verb it is attached to.
        if (v.dep_ == 'conj') and not subjects:
            inherited = [x for x in v.head.children
                         if x.dep_ in ['nsubj', 'nsubjpass', 'aux']]
            if _extendWithChunks(inherited, new_sent_words, sentence):
                has_sense = True
        _traceSentenceWords('subjects', new_sent_words)

        if v.dep_ in ['xcomp', 'advcl']:
            # Climb at most five heads, keeping every ancestor on the way,
            # until a governing verb is reached.
            h = v
            for _ in range(5):
                h = h.head
                new_sent_words.append(h)
                if h.pos_ == 'VERB':
                    has_sense = True
                    break
        _traceSentenceWords('parent verb', new_sent_words)

        new_sent_words = sorted(new_sent_words, key=lambda x: x.i)

        if has_sense:
            simple_text = wordsToSimpleText([new_sent_words])
            if v.dep_ == 'acl':
                simple_text = "It is " + simple_text
            verb_simple_texts.append(simple_text)
    return verb_simple_texts


In [355]:
def adjNounToSentence(sentence, noun_simple_texts):
    """Build "<noun> is/are ..." sentences for the common nouns of a doc.

    sentence          -- parsed spaCy doc (iterable of tokens).
    noun_simple_texts -- dict keyed by noun text; updated in place and
                         returned. Later rules overwrite earlier ones for
                         the same noun text.

    Rules applied to every NOUN token, in this order:
      1. 'amod' children tagged ADJ or VERB: "<Noun> <verb> <their subtrees>. "
      2. every 'appos' child:                 "<Noun> <verb> <its subtree>. "
      3. every 'pobj' under an in/at/on 'prep' child:
                                              "<Noun> <verb> <prep> <subtree>. "
    <verb> is "are" for tag NNS and "is" otherwise.
    """
    def capitalized(text):
        return text[0].upper() + text[1:]

    def joinedInOrder(tokens):
        return ' '.join([t.text for t in sorted(tokens, key=lambda t: t.i)])

    for noun in [tok for tok in sentence if tok.pos_ == 'NOUN']:
        copula = "are" if noun.tag_ == 'NNS' else "is"
        stem = noun.text + ' ' + copula + ' '

        # Rule 1: adjectival modifiers together with everything below them.
        modifier_tokens = []
        for child in noun.children:
            if (child.dep_ == 'amod') and (child.pos_ in ['ADJ', 'VERB']):
                modifier_tokens.extend(child.subtree)
        if len(modifier_tokens) > 0:
            noun_simple_texts[noun.text] = capitalized(stem + joinedInOrder(modifier_tokens)) + '. '

        # Rule 2: appositions.
        for child in noun.children:
            if child.dep_ == 'appos':
                noun_simple_texts[noun.text] = capitalized(stem + joinedInOrder(child.subtree) + '. ')

        # Rule 3: locative prepositional phrases.
        for child in noun.children:
            if (child.dep_ == 'prep') and (child.text in ['in', 'at', 'on']):
                for pobj in [x for x in child.children if x.dep_ == 'pobj']:
                    phrase = stem + child.text + ' ' + joinedInOrder(pobj.subtree) + '. '
                    noun_simple_texts[noun.text] = capitalized(phrase)

    return noun_simple_texts

In [381]:
def propnConjToSentence(sentence, noun_simple_texts):
    """Build simple sentences around the proper nouns of a doc.

    sentence          -- parsed spaCy doc (iterable of tokens).
    noun_simple_texts -- dict keyed by proper-noun text; updated in place
                         and returned.

    For every PROPN token:
      * each 'appos' child yields "<Propn> is <subtree of the child>. ";
      * unless the token has a 'cc' child, each PROPN child attached as
        'conj' or 'appos' yields "<propn> is in <child>. " or
        "<propn> is located in <child>. " (picked with random.choice),
        overwriting any earlier entry for the same proper noun.
    """
    parts = [' is in ', ' is located in ']
    for propn in [tok for tok in sentence if tok.pos_ == 'PROPN']:
        kids = list(propn.children)

        for kid in kids:
            if kid.dep_ != 'appos':
                continue
            ordered = sorted(kid.subtree, key=lambda t: t.i)
            sent_text = propn.text + ' is ' + ' '.join([t.text for t in ordered]) + '. '
            noun_simple_texts[propn.text] = sent_text[0].upper() + sent_text[1:]

        # A coordinated list ("X and Y") is not a containment relation.
        if any(kid.dep_ == 'cc' for kid in kids):
            continue

        for kid in kids:
            if (kid.dep_ in ['conj', 'appos']) and (kid.pos_ == 'PROPN'):
                noun_simple_texts[propn.text] = propn.text + random.choice(parts) + kid.text + '. '

    return noun_simple_texts

In [318]:
def replaceWhich(token, doc):
    """Find the noun a relative pronoun such as "which"/"that" refers to.

    Walks up the dependency tree from `token` (at most 50 steps) and returns
    the first ancestor whose part of speech is NOUN or PROPN, or None when
    the top of the tree is reached without finding one.

    token -- starting token.
    doc   -- unused; kept for interface compatibility.
    """
    for _ in range(50):
        parent = token.head
        if parent.pos_ in ['NOUN', 'PROPN']:
            return parent
        # The root token is its own head. Stopping here avoids spinning on
        # the root, which a test against the lowercase label 'root' does
        # when the parser names the root 'ROOT'.
        if parent == token:
            return None
        token = parent

    return None

In [319]:
def replaceWho(token, doc):
    """Find the proper noun a relative pronoun such as "who" refers to.

    Walks up the dependency tree from `token` (at most 50 steps) and returns
    the first ancestor whose part of speech is PROPN, or None when the top of
    the tree is reached without finding one.

    token -- starting token.
    doc   -- unused; kept for interface compatibility.
    """
    for _ in range(50):
        parent = token.head
        if parent.pos_ in ['PROPN']:
            return parent
        # The root token is its own head. Stopping here avoids spinning on
        # the root, which a test against the lowercase label 'root' does
        # when the parser names the root 'ROOT'.
        if parent == token:
            return None
        token = parent

    return None

In [412]:
# Pick one random (complex, simple) pair from tokenized_texts; the cell below runs the baseline on it.
compl, sim = random.choice(list(tokenized_texts.items()))

In [413]:


# Run the whole baseline on the sampled pair and print the result next to
# the reference simplification.

# `compl` may already be unicode (tokenized_texts is built from spaCy token
# texts); only byte strings need decoding.
doc = nlp(compl if isinstance(compl, type(u'')) else compl.decode('utf-8'),
          disable=['ner', 'textcat'])
verb_s_texts = []
verb_s_texts = subjVerbObjToSentence(doc, verb_s_texts)
noun_s_texts = {}
noun_s_texts = adjNounToSentence(doc, noun_s_texts)
noun_s_texts = propnConjToSentence(doc, noun_s_texts)
print('----')
print(verb_s_texts)
print('----')
print(noun_s_texts)

# Interleave: after each verb sentence put the first not-yet-used noun
# sentence whose noun occurs in it; unused noun sentences go to the end.
simple_text = ''
used_nouns = set()
for sent in verb_s_texts:
    simple_text += sent
    st = sent.lower()
    for n in noun_s_texts.keys():
        # Skip nouns that were already placed, so they cannot match again
        # and stop the search before another noun gets its turn.
        if (n not in used_nouns) and (st.find(n.lower()) >= 0):
            simple_text += noun_s_texts[n]
            used_nouns.add(n)
            break
for n in noun_s_texts.keys():
    if n not in used_nouns:
        simple_text += noun_s_texts[n]
print('\n\ncomplex sentence: ')
print(doc.text)
print('simple_text_given: ')
print(sim)
print('simple_text_generated: ')
print(simple_text)
#     print 'rejected sentences: '
#     print rejected

ouxilaries [is]
subjects [is, WazaNationalPark]
attributes [is, WazaNationalPark]
neg [is, WazaNationalPark]
dobjects [is, WazaNationalPark]
pobjects [is, WazaNationalPark, in, north, the]
pobjects [is, WazaNationalPark, in, north, the]
subjects [is, WazaNationalPark, in, north, the]
parent verb [is, WazaNationalPark, in, north, the]
----
[u'WazaNationalPark is in the north . ']
----
{}


complex sentence: 
WazaNationalPark is in the north .
simple_text_given: 
WazaNationalPark is a good place to see wildlife in Cameroon .
simple_text_generated: 
WazaNationalPark is in the north . 


# Read and preprocess data

In [7]:
## main1 ##
# Both readers fill the dict they are given and return it, so they can be
# chained: Newsela pairs first, then the DBPedia pairs.
compl_to_sim = readDBPediaData(readNewselaData({}))
len(compl_to_sim)

8705

In [25]:
# Reload the pairs from the tab-separated text file:
# one "<key text>\t<value text>" pair per line.
compl_to_sim = {}
with open("../dataset/texts_geougraphy.txt", "r") as f:
    for line in f:
        # Drop the line terminator so it does not end up inside the value.
        texts = line.rstrip('\r\n').split('\t')

        if len(texts) != 2:
            continue
        try:
            # Encoding is only a probe: pairs that cannot be represented
            # as UTF-8 are skipped.
            texts[0].encode('utf-8')
            texts[1].encode('utf-8')
        except UnicodeError:
            continue
        compl_to_sim[texts[0]] = texts[1]
            
            

In [26]:
# Number of pairs currently held in compl_to_sim.
len(compl_to_sim)

8395

In [13]:
# Show one random (key, value) pair of compl_to_sim as a sanity check.
random.choice(list(compl_to_sim.items()))

('Atlantic City International Airport can be found in Egg Harbor Township , New Jersey in the U.S.A.',
 'Atlantic City International Airport is located at Egg Harbor Township , New Jersey . Egg Harbor Township is a part of New Jersey . Egg Harbor Township is located in New Jersey , in the United States .')

In [49]:
def _mergePropnTokens(doc):
    """Return the token texts of `doc` with each run of consecutive PROPN
    tokens glued into a single token; SPACE, SYM and X tokens are dropped."""
    text_tokens = []
    propn_text = ''
    for token in doc:
        if token.pos_ == 'PROPN':
            propn_text += token.text
            continue
        if len(propn_text) > 0:
            text_tokens.append(propn_text)
            propn_text = ''
        if token.pos_ not in ['SPACE', 'SYM', 'X']:
            text_tokens.append(token.text)
    # A proper-noun run that ends the text has no following token to
    # trigger the flush above.
    if len(propn_text) > 0:
        text_tokens.append(propn_text)
    return text_tokens


def _countWords(tokens, counts):
    """Add the lower-cased tokens to the frequency dict `counts` (in place)."""
    for tok in tokens:
        key = tok.lower()
        counts[key] = counts.get(key, 0) + 1


# Tokenize both sides of every pair, collect word frequencies per side and
# store the re-joined texts.
all_com_words = {}
all_sim_words = {}
tokenized_texts = {}
for com, sim in compl_to_sim.items():
    com_tokens = _mergePropnTokens(nlp(com.decode('utf-8')))
    _countWords(com_tokens, all_com_words)
    com_text = ' '.join(com_tokens)

    sim_tokens = _mergePropnTokens(nlp(sim.decode('utf-8')))
    _countWords(sim_tokens, all_sim_words)
    sim_text = ' '.join(sim_tokens)

    tokenized_texts[com_text] = sim_text

In [40]:
# Number of distinct tokenized key texts (pairs whose tokenized key collides are merged).
len(tokenized_texts)

8325

In [50]:
# Show one random tokenized pair as a sanity check.
random.choice(list(tokenized_texts.items()))

(u'The horns were carved into spoons and ladles , the hooves cooked to make glue .',
 u'They used buffalo horns to make spoons .')

In [None]:
# Randomly split the pairs with splitTrainTest and print the size of each part.
train, test = splitTrainTest(compl_to_sim)
print len(train), len(test)

# Get some examples of baseline results

In [21]:
class MyText(object):
    """Re-iterable stream of tokenized sentences built from a pair dict.

    Each (key, value) pair of `dataset` is joined with a space, parsed with
    the module-level `nlp` pipeline and split into sentences; every sentence
    is yielded as a list of tokens.
    """

    def __init__(self, dataset):
        # dict mapping complex text to simple text (byte strings that are
        # decoded as UTF-8 before parsing)
        self.dataset = dataset

    @staticmethod
    def _sentenceTokens(sentence):
        """Turn one sentence into its token list.

        Runs of consecutive PROPN tokens are glued into one token (case
        kept); SPACE, PUNCT, SYM, X and NUM tokens are dropped; all other
        token texts are lower-cased.
        """
        text_tokens = []
        propn_text = ''
        for token in sentence:
            if token.pos_ == 'PROPN':
                propn_text += token.text
                continue
            if len(propn_text) > 0:
                text_tokens.append(propn_text)
                propn_text = ''
            if token.pos_ not in ['SPACE', 'PUNCT', 'SYM', 'X', 'NUM']:
                text_tokens.append(token.text.lower())
        # A proper-noun run that ends the sentence has no following token
        # to trigger the flush above.
        if len(propn_text) > 0:
            text_tokens.append(propn_text)
        return text_tokens

    def __iter__(self):
        for com, sim in self.dataset.items():
            text = nlp((com + ' ' + sim).decode('utf-8'))
            for sentence in text.sents:
                yield self._sentenceTokens(sentence)

In [27]:
# Train word vectors on the sentences streamed by MyText.
# NOTE(review): the `size` / `iter` keyword names look like the pre-4.0 gensim API - confirm against the installed version.
data = MyText(compl_to_sim)
model = gensim.models.Word2Vec(data, size=100, min_count=2, iter=10)

In [37]:
# Inspect the model: the 20 entries ranked most similar to "country".
model.wv.most_similar(positive=["country"],   topn=20)

[(u'leader', 0.6677063703536987),
 (u'AsianAmericans', 0.6671854257583618),
 (u'group', 0.6590251922607422),
 (u'groups', 0.6352567076683044),
 (u'EthiopianBirr', 0.6279417872428894),
 (u'AfricanAmericans', 0.6198752522468567),
 (u'ethnic', 0.6174106597900391),
 (u'language', 0.6029496192932129),
 (u'capital', 0.5959588289260864),
 (u'state', 0.5826288461685181),
 (u'SouthAfrica', 0.5808447003364563),
 (u'leaders', 0.5779481530189514),
 (u'RioSolimoes', 0.5771121978759766),
 (u'Oregon', 0.5763739347457886),
 (u'region', 0.5747165679931641),
 (u'city', 0.5572177767753601),
 (u'English', 0.5509083867073059),
 (u'President', 0.5377963185310364),
 (u'Japan', 0.5365291833877563),
 (u'U.S.AsianAmericans', 0.5361120700836182)]