In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model

from network import create_model, find_checkpoint_file, train
from preporcessing import split_text_to_array_of_sentences
from utils import preprocess_data, load_train_data, load_from_file

Using TensorFlow backend.


# Import configuration for this experiment

In [2]:
# Pick the experiment configuration. The config module offers:
# long_train_conf, short_conf, test_conf
from config import long_train_conf as conf

# Data source. `drive` stays None while data is read from the filesystem.
drive = None
LOAD_METHOD = load_from_file

# Set to True to reuse previously calculated weights. Only meaningful when
# the model itself has not changed (apart from its weights).
LOAD_WEIGHTS_FROM_CHECKPOINT = True

# Configuration for Google Colab (skip this cell if not running in Colab)

In [None]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive

from colab_tools import g_authenticate, check_gpu, load_from_gdrive

LOAD_METHOD = load_from_gdrive
drive = g_authenticate()

# check if GPU is available:
check_gpu

# Load the input and output sequences

In [4]:
# Load the raw source ('vocab_en') and target ('vocab_fr') training data.
# LOAD_METHOD and drive are taken from the configuration cells, so the
# Google Colab setup (when it has been run) is honoured instead of always
# reading from the local filesystem.
# NOTE(review): `drive` is passed by keyword because a positional argument
# after keyword arguments is a SyntaxError. The parameter name `drive` is
# assumed to match load_train_data's signature -- confirm against utils.
x, y = load_train_data(file_source_lang='vocab_en',
                       file_target_lang='vocab_fr',
                       load_method=LOAD_METHOD,
                       drive=drive)

- Loading training data


# Split the text into sentences and reverse the order of the input sequences

In [5]:
# Split both texts into arrays of sentences, passing the configured
# MAX_LEN and NUM_SENT values to the splitter.
# NOTE: x and y are rebound here, so re-running this cell alone feeds the
# already-split data back in; re-run from the loading cell instead.
x, y = split_text_to_array_of_sentences(
    x, y, conf['MAX_LEN'], conf['NUM_SENT'])

# Create word-to-index and index-to-word mappings, map the data

In [6]:
# Source language: preprocess with the configured vocabulary size.
(x_sent_array,
 x_vocab_len,
 x_word_to_idx,
 x_idx_to_word) = preprocess_data(x, conf['VOCAB_SIZE'])

# Target language: same preprocessing, same vocabulary size setting.
(y_sent_array,
 y_vocab_len,
 y_word_to_idx,
 y_idx_to_word) = preprocess_data(y, conf['VOCAB_SIZE'])

- creating vocabulary
- creating mappings
- converting words to indices
- creating vocabulary
- creating mappings
- converting words to indices


# Find the length of the longest sequence, pad with zeros

In [7]:
# The longest sentence on each side determines the padded width.
x_max_len = max(map(len, x_sent_array))
y_max_len = max(map(len, y_sent_array))

# Zero-pad every sequence up to the length of the longest one so that all
# sequences on a given side share the same length.
# Note: `y` is rebound here from the sentence data to the padded target array.
print('Zero padding...')
X = pad_sequences(x_sent_array, dtype='int32', maxlen=x_max_len)
y = pad_sequences(y_sent_array, dtype='int32', maxlen=y_max_len)

Zero padding...


# Create network model

In [8]:
# Build the network from the vocabulary lengths, the padded sequence
# lengths and the configured HIDDEN_DIM / LAYER_NUM values.
model = create_model(
    x_vocab_len,
    x_max_len,
    y_vocab_len,
    y_max_len,
    conf['HIDDEN_DIM'],
    conf['LAYER_NUM'],
)

# Write a diagram of the model to model.png in the working directory.
plot_model(model, to_file='model.png')


In [10]:
# Look for trained weights from a previous run only when the configuration
# asks for them. An unconditional `saved_weights = []` after the lookup
# would discard the result and make LOAD_WEIGHTS_FROM_CHECKPOINT a no-op.
# To train from scratch, set LOAD_WEIGHTS_FROM_CHECKPOINT = False in the
# configuration cell; saved_weights is then the empty list.
if LOAD_WEIGHTS_FROM_CHECKPOINT:
    saved_weights = find_checkpoint_file('.')
else:
    saved_weights = []

train(X, y, y_word_to_idx, y_max_len, saved_weights, model, conf)

[INFO] Training model: epoch 1th 0/137860 samples
Epoch 1/1
 - 5s - loss: 5.5111 - acc: 0.3607
[INFO] Training model: epoch 1th 1000/137860 samples
Epoch 1/1
 - 2s - loss: 5.4291 - acc: 0.4030
[INFO] Training model: epoch 1th 2000/137860 samples
Epoch 1/1
 - 2s - loss: 5.2074 - acc: 0.4116
[INFO] Training model: epoch 1th 3000/137860 samples
Epoch 1/1


KeyboardInterrupt: 