In [57]:
import nltk
import numpy as np
import tensorflow as tf
from tensorflow.python import keras
from tensorflow.python.keras import preprocessing
from tensorflow.python.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [58]:
# Fetch the ComTrans aligned corpus used by the cells below; NLTK reports the
# package as up-to-date when a current local copy already exists.
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to /home/hafsa/nltk_data...
[nltk_data]   Package comtrans is already up-to-date!


True

In [59]:
# Load the ComTrans corpus reader and preview the first aligned sentence pair
# from the English-French alignment file.
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [60]:
# Number of aligned sentence pairs available in the English-French file.
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

In [61]:
# Parse the corpus into x (English token lists) and Y (French token lists).
sents = comtrans.aligned_sents('alignment-en-fr.txt')

# Walk the aligned pairs directly instead of re-indexing the corpus by a manual
# counter, and take every pair rather than stopping at a hard-coded 33334.
x = []
Y = []
for pair in sents:
    x.append(pair.words)  # English side of the pair
    Y.append(pair.mots)   # French side of the pair

# Number of sentence pairs loaded (the original cell left `n` holding this count).
n = len(x)

In [62]:
# Sanity check: tokens of the first English sentence.
print(x[0])

['Resumption', 'of', 'the', 'session']


In [63]:
# Sanity check: tokens of the first French sentence.
print(Y[0])

['Reprise', 'de', 'la', 'session']


In [64]:
# Collect the distinct tokens that appear on each side of the corpus.

# French vocabulary: every unique token across all sentences in Y.
set_y = {token for sentence in Y for token in sentence}
print(len(set_y))

# English vocabulary: every unique token across all sentences in x.
set_x = {token for sentence in x for token in sentence}
print(len(set_x))

23952
18762


### Tokenize and Pad the data

In [65]:
def tokenize(x):
    """Fit a word-level Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    :param x: Texts to fit the tokenizer on and to convert
    :return: Tuple ``(sequences, tokenizer)``: the output of
        ``texts_to_sequences(x)`` and the fitted ``Tokenizer`` itself
    """
    tokenizer = Tokenizer(char_level=False)
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

In [66]:
def pad(x, length=None):
    """Pad every sequence in ``x`` at the end so all share one length.

    :param x: Collection of sequences (each must support ``len``)
    :param length: Target length; when None, the length of the longest
        sequence in ``x`` is used
    :return: Whatever ``pad_sequences`` returns for ``x`` with
        ``maxlen=length`` and ``padding='post'``
    """
    if length is None:
        length = max(len(sentence) for sentence in x)
    # This return must sit outside the `if`; when it was nested inside it, any
    # call that supplied an explicit `length` fell through and returned None.
    return pad_sequences(x, maxlen=length, padding='post')

In [78]:
def preprocess_text(x, y):
    """Tokenize and pad both sides of a parallel corpus.

    :param x: Source-side texts
    :param y: Target-side texts
    :return: Tuple ``(padded_x, padded_y, tokenizer_x, tokenizer_y)``;
        ``padded_y`` carries an extra trailing axis of size 1
    """
    x_sequences, x_tokenizer = tokenize(x)
    y_sequences, y_tokenizer = tokenize(y)

    x_padded = pad(x_sequences)
    y_padded = pad(y_sequences)

    # Keras's sparse_categorical_crossentropy function requires the labels to
    # be in 3 dimensions, so append a trailing axis of size 1.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tokenizer, y_tokenizer

# Run the preprocessing pipeline over the whole corpus.
(padded_x,
 padded_y,
 tokenized_x,
 tokenized_y) = preprocess_text(x, Y)

# Axis 1 of each padded array is the (padded) sentence length.
max_english_sequence_length = padded_x.shape[1]
max_french_sequence_length = padded_y.shape[1]

# One slot more than the number of indexed words, leaving room for id 0,
# which the padded arrays use as filler.
english_vocab_size = 1 + len(tokenized_x.word_index)
french_vocab_size = 1 + len(tokenized_y.word_index)

# Summary of the preprocessed data.
print(padded_x[0])
print(padded_x.shape)
print(padded_y.shape)
print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

[3328    5    1 1019    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
(33334, 40)
(33334, 40, 1)
Data Preprocessed
Max English sentence length: 40
Max French sentence length: 40
English vocabulary size: 17076
French vocabulary size: 22820


In [79]:
def logits_to_text(logits, tokenizer):
    """Decode per-position scores into a space-separated string of words.

    :param logits: 2-D array of scores; the argmax is taken along axis 1
    :param tokenizer: Object exposing ``word_index``, a word -> integer id mapping
    :return: The highest-scoring word for each row, joined by single spaces;
        id 0 is rendered as ``'<PAD>'``
    """
    # Invert the word -> id mapping so predicted ids can be looked up.
    id_to_word = {}
    for word, word_id in tokenizer.word_index.items():
        id_to_word[word_id] = word
    id_to_word[0] = '<PAD>'

    best_ids = np.argmax(logits, axis=1)
    return ' '.join(id_to_word[word_id] for word_id in best_ids)

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [80]:
# define function for the simple recurrent (GRU) model

def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size,
                 learning_rate=0.005):
    """
    Build and compile a basic GRU-based model on x and y (training is left to the caller)
    :param input_shape: Tuple of input shape; its first entry is dropped and the
        remainder is passed to the GRU layer as `input_shape`
    :param output_sequence_length: Length of output sequence (kept for interface
        compatibility; not used when building the layers)
    :param english_vocab_size: Number of unique English words in the dataset (kept for
        interface compatibility; not used when building the layers)
    :param french_vocab_size: Number of unique French words in the dataset; sets the
        width of the final softmax layer
    :param learning_rate: Learning rate handed to the Adam optimizer (default 0.005)
    :return: Keras model built and compiled, but not trained
    """
    # Recurrent layer that returns its full output sequence, followed by
    # Dense layers wrapped in TimeDistributed.
    model = Sequential()
    model.add(GRU(256, input_shape=input_shape[1:], return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

#tests.test_simple_model(simple_model)

# Reshaping the input to work with a basic RNN
#max_french_sequence_length = padded_y.shape[1]

In [81]:
print(max_french_sequence_length)
# The English input is reshaped using the French sequence length, so pad it to
# that length explicitly; re-padding to its own maximum was a no-op and the
# reshape only lined up because both maxima happened to be equal.
# NOTE: pad_sequences truncates longer rows from the front by default.
tmp = pad_sequences(padded_x, maxlen=max_french_sequence_length, padding='post')
# Add a trailing feature axis: (samples, timesteps, 1).
tmp_x = tmp.reshape((-1, max_french_sequence_length, 1))

40


In [82]:
# Build the model for the reshaped English input, then fit it against the
# padded French labels with a validation_split of 0.2.

simple_rnn_model = simple_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size,
)
simple_rnn_model.fit(
    tmp_x,
    padded_y,
    batch_size=1024,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7faeb202a070>