In [1]:
import random
import json
import re
import os
import bz2
import spacy



In [33]:
nlp = spacy.load('en_core_web_lg', disable=['ner', 'textcat', 'depend'])

# Functions to read and prepare data

In [3]:
# remove some frequent garbage from sentences 
def preprocessSentence(text):
    text = re.sub('-LRB-(.*?)-RRB-', '', text)
    text = re.sub('-RRB-', '', text)
    text = re.sub('^\"', '', text)
    text = re.sub('\"$', '', text)
    text = re.sub('\\n', '', text)
    text = re.sub('\\r', '', text)
    text = re.sub('\\t', '', text)
    text = re.sub('\`', '', text)
    text = re.sub('\'\'', '', text)
    return text

In [4]:
# read Newsela dataset
def readNewselaData(compl_to_sim):
    with bz2.BZ2File('../dataset/clear_newsela.txt.bz2', "r") as content:         
        for line in content:
            texts = line.split('\t')

            if len(texts) != 3:
#                 print 'wrong line: ', len(texts), line
                continue
            if (len(texts[0]) < 10) or (len(texts[1]) < 10):
#                 print 'empty line: ', line
                continue
                
            compl_to_sim[preprocessSentence(texts[0])] = preprocessSentence(texts[1])
    return compl_to_sim

In [5]:
def readWikiData(compl_to_sim):
    complex_sentences = {}
    with bz2.BZ2File('../dataset/wiki.bz2', "r") as content:  
        i = 1
        for line in content:
            complex_sentences[i] = preprocessSentence(line)
            i += 1
    i = 1
    with bz2.BZ2File('../dataset/simple_wiki.bz2', "r") as simple:
        for line in simple:
            prep_line = preprocessSentence(line)

            try:
                if (complex_sentences[i] != prep_line) or (len(complex_sentences[i]) < 100):
                    compl_to_sim[complex_sentences[i]] = prep_line
            except:
                continue
            i += 1
    return compl_to_sim

In [6]:
def readPWKPData(compl_to_sim):
    complex_sentences = {}
    with bz2.BZ2File('../dataset/PWKP_108016.bz2', "r") as content:  
        simple = False
        simple_sent = ''
        compls_sent = ''
        for line in content:
            if len(line) < 5:
                simple = False
                if (len(simple_sent) > 10) and (len(compls_sent) > 10):
                    compl_to_sim[preprocessSentence(compls_sent)] = preprocessSentence(simple_sent)
                simple_sent = ''
                compls_sent = ''
            elif simple == False:
                compls_sent += line + ' '
                simple = True
            else:
                simple_sent += line + ' '
    return compl_to_sim

In [17]:
def clearSentence(text, vocab, voc_freq, re_print):
    
    text = text.decode('UTF-8')
    doc = nlp(text)
    text_tokens = []
    prev_pos = ''
    propn_text = ''
    for x in doc:
        if (x.pos_ == 'PROPN'):
            propn_text += x.text
        elif len(propn_text) > 0:
            text_tokens.append(propn_text)
            propn_text = ''
#         elif (x.lemma_ == '-PRON-'):
#             text_tokens.append(x.text.lower())
        if x.pos_ not in ['SPACE', 'PUNCT', 'SYM', 'X', 'NUM', 'PROPN']:
            text_tokens.append(x.text.lower())
            
        
    text_tokens = [re_print.sub('', w) for w in text_tokens]
    tokens_count = len(text_tokens)
    if (tokens_count > 12):
        return ' ', tokens_count, vocab, voc_freq
    
    for word in text_tokens:
        if word in vocab:
            continue
        elif not voc_freq.has_key(word):
            voc_freq[word] = 1
        elif voc_freq[word] == 3:
            vocab.append(word)
        else:
            voc_freq[word] += 1     
    
    return ' '.join(text_tokens), tokens_count, vocab, voc_freq
    

In [18]:
def clearLong(dataset):
    
    new_dataset = {}
    vocab = []
    voc_freq = {}
    i = 0
    for key, val in dataset.iteritems():
        if i % 1000 == 0:
            print(i)
        i += 1
        key = key.replace('  ', ' ')
        key_tok = key.split(' ')
        val = val.replace('  ', ' ')
        val_tok = val.split(' ')        

        max_encoder_seq_length = 12
        max_decoder_seq_length = 12
        
        if (len(val_tok) > max_decoder_seq_length) or (len(key_tok) > max_encoder_seq_length):
            continue
    
        for word in val_tok:
            if word in vocab:
                continue
            elif not voc_freq.has_key(word):
                voc_freq[word] = 1
            elif voc_freq[word] == 3:
                vocab.append(word)
            else:
                voc_freq[word] += 1     

        for word in key_tok:
            if word in vocab:
                continue
            elif not voc_freq.has_key(word):
                voc_freq[word] = 1
            elif voc_freq[word] == 3:
                vocab.append(word)
            else:
                voc_freq[word] += 1
                
        new_dataset[' '.join(key_tok)] = ' '.join(val_tok)

    vocab.append('start_')
    vocab.append('_end')
    vocab.append('UNkN')

    return new_dataset, vocab, max_encoder_seq_length, max_decoder_seq_length

    

In [19]:
# split data to train and test set
def splitTrainTest(compl_to_sim):
    train = {}
    test = {}
    vocab = []
    max_com_length = 0
    max_sim_length = 0
    voc_freq = {}
    re_print = re.compile('[^%s]' % re.escape(string.printable))

    ii = 0
    for compl, sim in compl_to_sim.iteritems():
        if (ii % 1000 == 1):
            print(ii)
        ii += 1
        try:
            sim_u = sim.encode('utf-8')
            compl_u = compl.encode('utf-8')
        except:
            continue
#         com, s = replaceProperNouns(compl, sim)
#         com, s = compl, sim
        com, lc, vocab, voc_freq = clearSentence(compl, vocab, voc_freq, re_print)
        if (lc > 12) or (lc < 3):
            continue
        s, ls, vocab, voc_freq = clearSentence(sim, vocab, voc_freq, re_print)
        if (ls > 12) or (ls < 3): 
            continue
        
        if (max_com_length < lc):
            max_com_length = lc
        if (max_sim_length < ls):
            max_sim_length = ls
            
        r = random.random()
        if r < 0.8:
            train[com] = s
        else:
            test[com] = s
    return train, test, vocab, max_com_length, max_sim_length

# Read and preprocess data

In [10]:
compl_to_sim = {}

In [11]:
compl_to_sim = readNewselaData(compl_to_sim)

In [12]:
compl_to_sim = readWikiData(compl_to_sim)

In [13]:
compl_to_sim = readPWKPData(compl_to_sim)

In [14]:
len(compl_to_sim)

234022

In [15]:
random.choice(list(compl_to_sim.items()))

('This combination of propellants is sometimes still used in torpedoes. ',
 'This combination of propellants is still used in torpedoes. ')

In [16]:
import string

In [20]:
# train, test = splitTrainTest(compl_to_sim)
# print len(train), len(test)

train, test, tokens, max_encoder_seq_length, max_decoder_seq_length = splitTrainTest(compl_to_sim)
num_tokens = len(tokens)


1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001
13001
14001
15001
16001
17001
18001
19001
20001
21001
22001
23001
24001
25001
26001
27001
28001
29001
30001
31001
32001
33001
34001
35001
36001
37001
38001
39001
40001
41001
42001
43001
44001
45001
46001
47001
48001
49001
50001
51001
52001
53001
54001
55001
56001
57001
58001
59001
60001
61001
62001
63001
64001
65001
66001
67001
68001
69001
70001
71001
72001
73001
74001
75001
76001
77001
78001
79001
80001
81001
82001
83001
84001
85001
86001
87001
88001
89001
90001
91001
92001
93001
94001
95001
96001
97001
98001
99001
100001
101001
102001
103001
104001
105001
106001
107001
108001
109001
110001
111001
112001
113001
114001
115001
116001
117001
118001
119001
120001
121001
122001
123001
124001
125001
126001
127001
128001
129001
130001
131001
132001
133001
134001
135001
136001
137001
138001
139001
140001
141001
142001
143001
144001
145001
146001
147001
148001
149001
150001
151001
152001
153001
154001
155001
156001
157001
158001


In [21]:
len(train)

38623

In [22]:
tokens.append('start_')
tokens.append('_end')
tokens.append('UNkN')
num_tokens = len(tokens)

In [226]:
train, tokens, max_encoder_seq_length, max_decoder_seq_length = clearLong(train)
num_tokens = len(tokens)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000


In [23]:
num_tokens = len(tokens)

In [2]:
train = {}
with open("../dataset/train_12.txt", "r") as f:
    lines = f.readlines()
    for l in lines:
        parts = l.split('\t')
        if len(parts) != 2:
            continue
        train[parts[0]] = parts[1]

In [3]:
len(train)

38546

In [4]:
tokens = []
with open("../dataset/vocab_12.txt", "r") as f:
    lines = f.readlines()
    for l in lines:
        if len(l) > 0:
            tokens.append(l)
num_tokens = len(tokens)

In [24]:
print('Number of samples:', len(train))
print('Number of unique tokens:', num_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

('Number of samples:', 38623)
('Number of unique tokens:', 20672)
('Max sequence length for inputs:', 12)
('Max sequence length for outputs:', 12)


In [25]:
with open("../dataset/train_12_3.txt", "w") as f:
    for com, sim in train.iteritems():
        f.write(com + '\t' + sim + '\n')

with open("../dataset/test_12_3.txt", "w") as f:
    for com, sim in test.iteritems():
        f.write(com + '\t' + sim + '\n')
        
with open("../dataset/vocab_12_3.txt", "w") as f:
    for tok in tokens:
        f.write(tok + '\n')


In [26]:
input_texts = []
target_texts = []
max_encoder_seq_length = 0
max_decoder_seq_length = 0
for com, sim in train.iteritems():
    input_texts.append(com)
    sim = 'start_ ' + sim + ' _end'
    target_texts.append(sim)
    if (len(com.split(' ')) > max_encoder_seq_length):
        max_encoder_seq_length = len(com.split(' '))
    if (len(sim.split(' ')) > max_decoder_seq_length):
        max_decoder_seq_length = len(sim.split(' '))


In [27]:
random.choice(zip(input_texts, target_texts))    

(u'in the church was destroyed in a fire',
 u'start_ in the church was destroyed in a fire _end')

In [28]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

Using TensorFlow backend.


In [29]:
batch_size = 80  # Batch size for training.
epochs = 5  # Number of epochs to train for. ___100___
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

In [30]:
token_index = dict(
    [(char, i) for i, char in enumerate(tokens)])

In [31]:
len(train)*17*num_tokens

13573049152

In [32]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text.split()):
        try:
            encoder_input_data[i, t, token_index[char]] = 1.
        except:
            encoder_input_data[i, t, token_index['UNkN']] = 1.
                
    for t, char in enumerate(target_text.split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        try:
            decoder_input_data[i, t, token_index[char]] = 1.
        except:
            decoder_input_data[i, t, token_index['UNkN']] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            try:
                decoder_target_data[i, t - 1, token_index[char]] = 1.
            except:
                decoder_target_data[i, t - 1, token_index['UNkN']] = 1

MemoryError: 

In [24]:
encoder_input_data.shape, decoder_input_data.shape, decoder_target_data.shape

((38546, 12, 9253), (38546, 14, 9253), (38546, 14, 9253))

In [25]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [26]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [27]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb7025e8ed0>

In [30]:
# Save model
model.save('s2s12_PrPR.h5')

ImportError: `save_model` requires h5py.

In [None]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

In [31]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_char_index = dict(
    (i, char) for char, i in token_index.items())


In [34]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, token_index['start_']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_char_index[sampled_token_index]
        decoded_sentence += sampled_char + ' '

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [35]:
# encoder_input_data_test = np.zeros(
#     (len(test), max_encoder_seq_length, num_encoder_tokens),
#     dtype='float32')

# for i, (input_text, target_text) in enumerate(test):
#     for t, char in enumerate(input_text.split()):
#         encoder_input_data_test[i, t, input_token_index[char]] = 1.


for seq_index in range(100):
# for compl, sim in test.iteritems():
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: there are categories of education and training that a mb will undergo
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: DedicationMarch no was completed in November and published in
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: FarbenLehre is a polish punk band formed in by WojciechWojda
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: the Futureheads released their self titled debut album in September on recordings
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: the url of \/ was mistakenly added to the malware patterns file
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: the ShihTzu is a breed of dog weighing with long silky hair
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: fighting in the civil war he won many medals
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: NASA 's JIMO was canceled in
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: refusing to speak to some segregated auditoriums while speaking to others
Decoded sentence: U

-
Input sentence: they are commonly found on food labels throughout the EuropeanUnion
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: BethnalGreen railway station is in the LondonBorough of TowerHamlets in east London
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: Stratford did as well in the face of pressure from the MapleLeafs
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: on March a volcano erupted on RitterIsland causing a megatsunami
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: he attended the University of Nebraska at Kearney and graduated in
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: DawnFraserAO MBE born September is an australian champion swimmer
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: the NightSanta went Crazy is an original song by WeirdAl Yankovic
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: Young was also elected to the BaseballHall of Fame in
Decoded sentence: UNkN UNkN UNkN 
-
Input sentence: however Grover meets a monster