In [1]:
import random
import json
import re
import os
import bz2
import spacy
# Shared spaCy pipeline; getDependencyRepresentation reads pos_ and dep_ from its tokens.
# NOTE(review): 'depend' does not look like a spaCy component name (the dependency
# parser is normally called 'parser'). The parser has to stay enabled because dep_ is
# used later in this notebook -- confirm that this entry is just a harmless no-op.
nlp = spacy.load('en', disable=['ner', 'textcat', 'depend'])

# Functions to read and prepare data

In [2]:
# Extract the complex sentence, its simple sentences and the text category
# from the raw record of a single sentence in the DBPedia dataset.

def parseSentenceData(data):
    """Parse the raw text of one COMPLEX record of the DBPedia dataset.

    The record is a list of chunks separated by blank lines. The first chunk
    holds the ``COMPLEX-<id>`` header followed by the complex sentence. Every
    other chunk is either a meaning representation (``COMPLEX-<id>:MR-<n>``)
    or a simplification (``COMPLEX-<id>:MR-<n>:SIMPLE-<m>``).

    Args:
        data: text of the whole record, header line included.

    Returns:
        A tuple ``(complexsent, simpsents, cat)`` where
        ``complexsent`` is the second line of the first chunk,
        ``simpsents`` is a dict whose keys are the distinct simplifications
        (the lines of a SIMPLE chunk joined with spaces) and whose values are 1,
        ``cat`` is the category of the last MR description line of the form
        ``category=... eid=Id... size=...`` or '' when there is none.

    Raises:
        IndexError, ValueError: if the first chunk has no ``COMPLEX-<id>``
        header or no sentence line.
    """
    chunks = data.strip().split("\n\n")

    complexsentdata = chunks[0].strip().split("\n")
    complexid = int(complexsentdata[0].split("-")[1].strip())
    complexsent = complexsentdata[1].strip()

    # The two headers are mutually exclusive: the MR header must be followed
    # directly by a newline, the SIMPLE header by ':SIMPLE-'.
    mr_header = 'COMPLEX-' + str(complexid) + ':MR-[0-9]*\n'
    simple_header = 'COMPLEX-' + str(complexid) + ':MR-[0-9]*:SIMPLE-[0-9]*\n'

    cat = ''
    simpsents = {}
    for item in chunks[1:]:
        lines = item.strip().split("\n")
        if re.match(mr_header, item):
            # An MR chunk without a description line carries no category.
            if len(lines) > 1 and re.match('category=[a-zA-Z]* eid=Id[0-9]* size=[0-9]*', lines[1]):
                cat = lines[1].strip().split(' ')[0].split('=')[1]
        elif re.match(simple_header, item):
            # A simplification is kept even when its MR chunk is missing;
            # nothing that is returned depends on the MR it belongs to.
            sents = " ".join(lines[1:]).strip()
            simpsents[sents] = 1

    return complexsent, simpsents, cat

In [3]:
# remove some frequent garbage from sentences 
def preprocessSentence(text):
    """Return `text` without bracketed asides and without enclosing quotes.

    Three substitutions are applied in order: every ``-LRB- ... -RRB-`` span
    is deleted (shortest match), then a double quote at the very start, then
    a double quote at the very end.
    """
    for pattern in ('-LRB-(.*?)-RRB-', '^\"', '\"$'):
        text = re.sub(pattern, '', text)
    return text

In [9]:
# Read DBPedia dataset
def readDBPediaData(compl_to_sim):
    """Add complex -> simple sentence pairs from the DBPedia dump to `compl_to_sim`.

    The bz2 file is read line by line. Lines are buffered until the next
    ``COMPLEX-<id>`` header shows up; the buffered record is then parsed with
    parseSentenceData and one of its simplifications is stored under the
    (preprocessed) complex sentence. The record that is still buffered when
    the file ends is stored as well.

    Args:
        compl_to_sim: dict that is updated in place.

    Returns:
        The same dict, for convenience.
    """
    filename = '../dataset/complexsimple.txt.bz2'

    def store_record(record_lines):
        # Parse one buffered record and keep a single simplification of it.
        if not record_lines:
            return
        complexsent, simpsents, cat = parseSentenceData("".join(record_lines))
        # First key in iteration order; '' when the record has no simplification
        # (indexing keys() would raise IndexError in that case).
        simple_sen = next(iter(simpsents), '')
        if len(simple_sen) > 0:
            compl_to_sim[preprocessSentence(complexsent)] = preprocessSentence(simple_sen)

    with bz2.BZ2File(filename, "r") as content:
        sentdata = []
        for line in content:
            if sentdata and re.match('COMPLEX-[0-9]*\n', line):
                # A new header closes the record collected so far.
                store_record(sentdata)
                sentdata = [line]
            else:
                sentdata.append(line)
        # No header follows the last record, so it has to be flushed here.
        store_record(sentdata)
    return compl_to_sim

In [10]:
# read Newsela dataset
def readNewselaData(compl_to_sim):
    """Add complex -> simple sentence pairs from the Newsela dump to `compl_to_sim`.

    Every line of the bz2 file is split on tabs. A line is used only when it
    has exactly three fields and both of the first two fields are at least
    10 characters long; field 0 becomes the key and field 1 the value, both
    passed through preprocessSentence. The dict is updated in place and
    returned.
    """
    with bz2.BZ2File('../dataset/clear_newsela.txt.bz2', "r") as content:
        for line in content:
            texts = line.split('\t')
            usable = len(texts) == 3 and len(texts[0]) >= 10 and len(texts[1]) >= 10
            if usable:
                compl_to_sim[preprocessSentence(texts[0])] = preprocessSentence(texts[1])
    return compl_to_sim

In [43]:
def getDependencyRepresentation(compl_to_sim_data):
    """Turn sentence pairs into POS/dependency tag sequences and split them.

    Both sentences of a pair are parsed with the module-level `nlp` pipeline.
    A pair is dropped when parsing fails, when the complex side has more than
    50 tokens, when the simple side has more than 70 tokens, when the simple
    side is not longer than the complex side, or when either tag sequence has
    at most one element. Tokens tagged PUNCT, SPACE or X do not contribute to
    the tag sequences.

    Args:
        compl_to_sim_data: dict mapping complex sentence -> simple sentence
            (byte strings that are decoded as UTF-8).

    Returns:
        ``(train, test)``: two dicts keyed by a running id shared by both.
        Each value is a dict with the keys "compl" and "sim" (tag sequences)
        and "com_text" and "sim_text" (the original sentences). A pair goes
        to train when random.random() < 0.8 and to test otherwise.

    NOTE(review): the saved output of a later cell shows 'OOO' between tag
    and label while this code joins them with a single 'O'; that output was
    presumably produced by an earlier version of this function.
    """
    train = {}
    test = {}
    i = 1

    skip_pos = ['PUNCT', 'SPACE', 'X']

    def tag_sequence(doc):
        # One "<POS>O<dependency label>" item per token that is kept.
        return [x.pos_ + 'O' + x.dep_ for x in doc if x.pos_ not in skip_pos]

    for compl, sim in compl_to_sim_data.iteritems():
        try:
            doc_c = nlp(compl.decode('utf-8'))
            doc_s = nlp(sim.decode('utf-8'))
        except Exception:
            # Best effort: skip pairs that cannot be decoded or parsed, but let
            # KeyboardInterrupt / SystemExit stop this long-running loop.
            continue

        len_c = len(doc_c)
        len_s = len(doc_s)
        if (len_c > 50) or (len_s > 70) or (len_s <= len_c):
            continue

        c_seq = tag_sequence(doc_c)
        s_seq = tag_sequence(doc_s)
        if (len(c_seq) <= 1) or (len(s_seq) <= 1):
            continue

        record = {"compl": c_seq, "sim": s_seq, "com_text": compl, "sim_text": sim}
        target = train if random.random() < 0.8 else test
        target[i] = record

        i += 1
    return train, test
    

# Read and preprocess data

In [12]:
## main1 ##
# Newsela pairs are loaded into a fresh dict first; the DBPedia pairs are then
# added to the same dict, so a DBPedia pair replaces a Newsela pair that has
# the same complex sentence.
compl_to_sim = readDBPediaData(readNewselaData({}))


In [23]:
# Build the POS/dependency tag sequences and split the pairs into train and test.
# NOTE(review): the split draws from random.random() and no seed is set in the
# visible cells, so the sizes printed here presumably change between runs.
train, test = getDependencyRepresentation(compl_to_sim)
print len(train), len(test)

17867 4441


In [24]:
print train[2]
    

{'sim_text': 'The leader is in charge of the government .', 'sim': [u'DETOOOdet', u'NOUNOOOnsubj', u'VERBOOOROOT', u'ADPOOOprep', u'NOUNOOOpobj', u'ADPOOOprep', u'DETOOOdet', u'NOUNOOOpobj'], 'compl': [u'DETOOOdet', u'ADJOOOamod', u'NOUNOOOnsubj', u'VERBOOOROOT', u'NOUNOOOattr', u'ADPOOOprep', u'NOUNOOOpobj'], 'com_text': 'The prime minister is head of government . '}


In [15]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [25]:
def create_tokenizer(lines):
    """Return a new Tokenizer (default settings) fitted on `lines`."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [26]:
def encode_sequences(tokenizer, length, lines):
    """Integer-encode `lines` with `tokenizer` and pad them to `length`.

    Padding is added at the end of each sequence ('post'); the result is
    whatever pad_sequences returns for maxlen=length.
    """
    return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')

In [27]:
def encode_output(sequences, vocab_size):
    """One-hot encode every row of `sequences` with `vocab_size` classes.

    `sequences` must expose a two-dimensional `.shape`; the result is
    reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

In [28]:
# Complex side: tokenizer fitted on the space-joined tag sequences of the
# training set, plus the longest sequence length.
compl_tokenizer = create_tokenizer([' '.join(x["compl"]) for i, x in train.iteritems()])
compl_length = max(len(x["compl"]) for i, x in train.iteritems())
# One extra slot on top of word_index -- presumably for id 0, which the padded
# sequences use as filler (confirm against the Tokenizer documentation).
compl_vocab_size = 1 + len(compl_tokenizer.word_index)

# Simple side: same procedure on the "sim" sequences.
sim_tokenizer = create_tokenizer([' '.join(x["sim"]) for i, x in train.iteritems()])
sim_length = max(len(x["sim"]) for i, x in train.iteritems())
sim_vocab_size = 1 + len(sim_tokenizer.word_index)

# The labels say "English" for both sides: the first two lines describe the
# complex-side sequences, the last two the simple-side sequences.
print('English Vocabulary Size: %d' % compl_vocab_size)
print('English Max Length: %d' % (compl_length))
print('English Vocabulary Size: %d' % sim_vocab_size)
print('English Max Length: %d' % (sim_length))

English Vocabulary Size: 279
English Max Length: 46
English Vocabulary Size: 244
English Max Length: 94


In [29]:
# Inspect the token -> integer id mapping learned for the simple-side tags.
print(sim_tokenizer.word_index)

{u'cconjooopreconj': 135, u'verbooonummod': 185, u'advoooexpl': 60, u'verboooaux': 30, u'detoooattr': 118, u'adpoooaux': 170, u'verbooooprd': 173, u'nounooonsubj': 8, u'adjooonsubjpass': 57, u'verboooroot': 3, u'adjooopobj': 58, u'adpoooagent': 21, u'advooooprd': 234, u'propnooodative': 155, u'adjoooappos': 123, u'pronooonsubj': 28, u'advooopcomp': 124, u'adjooomark': 214, u'intjooocompound': 150, u'symooopunct': 41, u'verbooocsubj': 74, u'nounoooattr': 10, u'nounoooquantmod': 230, u'propnoooacomp': 88, u'numooonsubj': 52, u'partoooamod': 226, u'detoooquantmod': 138, u'detooodobj': 119, u'numooodobj': 98, u'detooonummod': 191, u'adjooopcomp': 222, u'verboooauxpass': 9, u'partooocase': 20, u'nounooointj': 224, u'detooodet': 4, u'symooocc': 206, u'adjoooccomp': 114, u'detooodep': 139, u'adjooooprd': 100, u'adpoooprep': 2, u'nounoooxcomp': 187, u'detoooconj': 216, u'propnoooprep': 75, u'advoooneg': 66, u'nounooocsubj': 162, u'nounooonpadvmod': 44, u'adjoooroot': 94, u'nounooonummod': 195,

In [30]:
# Network input: padded integer ids of the complex-side tag sequences.
trainX = encode_sequences(compl_tokenizer, compl_length, [' '.join(x["compl"]) for i, x in train.iteritems()])
# Network target: simple-side sequences, integer-encoded, padded, then one-hot encoded.
trainY = encode_output(
    encode_sequences(sim_tokenizer, sim_length, [' '.join(x["sim"]) for i, x in train.iteritems()]),
    sim_vocab_size)

In [31]:
# Test input, encoded with the tokenizer and length fitted on the training set.
testX = encode_sequences(compl_tokenizer, compl_length, [' '.join(x["compl"]) for i, x in test.iteritems()])
# Integer-encoded targets are kept under their own name so they can be decoded later.
testY_rem = encode_sequences(sim_tokenizer, sim_length, [' '.join(x["sim"]) for i, x in test.iteritems()])
# One-hot version of the same targets, used as validation data.
testY = encode_output(testY_rem, sim_vocab_size)

In [25]:
# Number of training samples, target length, and the one-hot target of sample 555.
# NOTE(review): the saved output of this cell (5728 samples, length 89) does not
# match the 17867 training samples reported by model.fit, so it presumably comes
# from an earlier run.
print len(trainX), len(trainY[0]), trainY[555]

5728 89 [[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [32]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) Sequential encoder-decoder model.

    Layers, in order: an Embedding over `src_vocab` ids with `n_units`
    dimensions (input length `src_timesteps`, zero masked), an LSTM with
    `n_units` units, a RepeatVector of `tar_timesteps`, a second LSTM that
    returns sequences, and a time-distributed softmax Dense layer over
    `tar_vocab` classes.
    """
    layers = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    return model

In [33]:
# 256 units for the embedding and for both LSTM layers.
model = define_model(
    src_vocab=compl_vocab_size,
    tar_vocab=sim_vocab_size,
    src_timesteps=compl_length,
    tar_timesteps=sim_length,
    n_units=256,
)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [34]:
# Train for 20 epochs; the held-out test split is also used as validation data here.
model.fit(trainX, trainY, epochs=20, batch_size=98, validation_data=(testX, testY))

Train on 17867 samples, validate on 4441 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7faf63fad790>

In [50]:
# Number of training samples and padded input length.
# NOTE(review): the saved output of this cell (5652, 100) does not match the
# 17867 samples reported by model.fit, so it presumably comes from an earlier run.
print len(trainX), len(trainX[0])

5652 100


In [35]:
# Run the trained model on every test input.
translation = model.predict(testX)

In [36]:
# One prediction per test sample is expected.
print len(translation)

4441


In [53]:
# Raw model output for the first test sample.
print translation[0]

[[1.20282180e-04 3.84107083e-01 6.12753816e-03 ... 7.44440456e-07
  7.95420419e-06 9.38735752e-07]
 [5.77302271e-05 6.03535295e-01 1.54411513e-02 ... 2.66166637e-07
  4.01458919e-06 2.58714238e-07]
 [4.74725275e-05 5.34867585e-01 7.53834397e-02 ... 8.87111256e-08
  2.98961118e-06 7.85386334e-08]
 ...
 [9.98717904e-01 2.24181087e-04 1.09373803e-04 ... 3.07271897e-08
  8.07925904e-09 1.03261018e-08]
 [9.98793840e-01 2.08598518e-04 1.04964172e-04 ... 2.88884845e-08
  7.64654384e-09 9.62586277e-09]
 [9.98854518e-01 1.96412133e-04 1.01571386e-04 ... 2.73891700e-08
  7.29256877e-09 9.06798014e-09]]


In [37]:
def word_for_id(integer, tokenizer):
    """Return the word that `tokenizer.word_index` maps to `integer`.

    The index is scanned in iteration order and the first match wins;
    None is returned when no word has that id.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

In [38]:
from numpy import argmax

In [39]:
def predict_sequence(integers, tokenizer):
    """Decode a sequence of ids into a space-separated string of words.

    Decoding stops at the first id that word_for_id cannot resolve (it
    returns None for it); everything after that id is ignored.
    """
    words = []
    for token_id in integers:
        token = word_for_id(token_id, tokenizer)
        if token is None:
            break
        words.append(token)
    return ' '.join(words)

In [42]:
# Index of the test example to inspect.
ff = 999
# Complex-side input tags of that example, decoded back from integer ids.
print predict_sequence(testX[ff], compl_tokenizer)
print ''
# Reference simple-side tags (integer-encoded copy kept before one-hot encoding).
print predict_sequence(testY_rem[ff], sim_tokenizer)
print ''
# Model output: take the highest-scoring id at every time step and decode it.
integers = [argmax(vector) for vector in translation[ff]]
generated = predict_sequence(integers, sim_tokenizer)
print generated

detooodet nounooonsubj adpoooprep propnooopobj propnoooappos verboooroot propnooocompound propnoooattr adjooonsubjpass verboooauxpass verbooorelcl adpoooagent detooodet propnooocompound propnooocompound propnooopobj verboooacl adpoooprep detooodet propnooocompound propnooocompound propnooopobj advoooadvmod propnooonsubjpass verboooauxpass verbooorelcl

propnooocompound propnooonsubj verboooroot detooodet nounoooattr adpoooprep propnooopobj propnoooappos propnooonsubjpass adjooonsubj verbooorelcl nounoooattr adpoooprep propnooopobj verboooauxpass verboooroot adpoooagent detooodet propnooocompound propnooocompound propnooopobj detooodet propnooocompound propnooocompound propnooonsubj verboooroot adpoooprep detooodet propnooocompound propnooocompound propnooopobj propnooonsubjpass verboooauxpass verboooroot adpoooprep propnooopobj

detooodet propnooocompound verboooroot adpoooprep verboooroot propnooocompound propnooocompound propnooocompound propnooocompound propnooocompound propnooocomp