In [9]:
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pandas as pd

import io
import spacy

import string
import regex as re

import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Install the large English spaCy pipeline into the current environment and
# import it as a module (it is loaded later as the source-language model).
!python -m spacy download en_core_web_lg
import en_core_web_lg

# Same for the small German pipeline (loaded later as the target-language model).
# NOTE(review): the downloads are not version-pinned; the recorded output shows
# 3.4.0 being installed for both packages.
!python -m spacy download de_core_news_sm
import de_core_news_sm


Collecting en-core-web-lg==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [3]:
# Read the tab-separated sentence-pair file. Because explicit column names are
# supplied, the first line of the file is treated as data, not as a header.
# NOTE(review): default quote handling is in effect; if the file contains
# unbalanced double quotes, rows could be merged - confirm the row count
# against the number of lines in the file.
df_en_de = pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [4]:
# Discard the unused third column and give the two sentence columns
# descriptive names.
df_en_de = (
    df_en_de
    .drop(columns=['attr'])
    .rename(columns={'eng': 'english', 'deu': 'german'})
)

In [5]:
# Display the renamed frame (the notebook shows a truncated head/tail view).
df_en_de

Unnamed: 0,english,german
0,Go.,Geh.
1,Hi.,Hallo!
2,Hi.,Grüß Gott!
3,Run!,Lauf!
4,Run.,Lauf!
...,...,...
251715,If someone who doesn't know your background sa...,"Wenn jemand Fremdes dir sagt, dass du dich wie..."
251716,If someone who doesn't know your background sa...,"Wenn jemand, der nicht weiß, woher man kommt, ..."
251717,It may be impossible to get a completely error...,"Es ist wohl unmöglich, einen vollkommen fehler..."
251718,I know that adding sentences only in your nati...,"Ich weiß wohl, dass das ausschließliche Beitra..."


In [10]:
# Normalise both language columns (lowercase, ASCII punctuation removed) and
# wrap every German (target) sentence in START_/_END markers.
#
# The cell is safe to re-run: markers left by a previous run are stripped
# before normalisation. Without that step a second run would lowercase the old
# markers, delete their underscore and wrap the sentence a second time
# ("START_ start geh end _END").

START_MARKER = 'START_ '
END_MARKER = ' _END'

# Deletes every character in string.punctuation. The apostrophe is part of
# that set, so no separate quote-removal pass is needed. Only ASCII
# punctuation is covered; typographic quotes and similar characters remain.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _normalize_sentence(sentence):
    """Return ``sentence`` lowercased with all ASCII punctuation removed."""
    return sentence.lower().translate(_PUNCTUATION_TABLE)


def _strip_markers(sentence):
    """Return ``sentence`` without START_/_END markers from an earlier run.

    Sentences that are not wrapped in both markers are returned unchanged.
    """
    if sentence.startswith(START_MARKER) and sentence.endswith(END_MARKER):
        return sentence[len(START_MARKER):-len(END_MARKER)]
    return sentence


df_en_de['english'] = df_en_de['english'].map(_normalize_sentence)

# Add start and end tokens to target sequences (after normalisation, so the
# underscore in the markers survives the punctuation removal).
df_en_de['german'] = (
    df_en_de['german']
    .map(_strip_markers)
    .map(_normalize_sentence)
    .map(lambda sentence: START_MARKER + sentence + END_MARKER)
)



In [11]:
# Spot-check the first rows after normalisation and marker insertion.
df_en_de.head()

Unnamed: 0,english,german
0,go,START_ geh _END
1,hi,START_ hallo _END
2,hi,START_ grüß gott _END
3,run,START_ lauf _END
4,run,START_ lauf _END


In [12]:
# Bind the dataframe to a shorter name for convenience. This is an alias of
# the same object, not a copy, so column assignments through `pairs` also
# show up on `df_en_de`.
pairs = df_en_de

In [14]:
# Longest sentence, counted in single-space-separated pieces, that is kept.
max_len = 10

pairs = df_en_de

# Per-sentence piece counts. The German count includes the START_/_END
# markers, so at most max_len - 2 actual words survive on the target side.
pairs['english_length'] = pairs['english'].str.split(' ').str.len()
pairs['german_length'] = pairs['german'].str.split(' ').str.len()
print(len(pairs))

# Keep only the pairs where both sides respect the length limit.
pairs = pairs.loc[
    lambda frame: (frame['english_length'] <= max_len)
    & (frame['german_length'] <= max_len)
]
print(len(pairs))

# Work on a reproducible 10% subsample of the filtered pairs.
pairs = pairs.sample(frac=0.1, random_state=1)
print(len(pairs))

251720
209317
20932


In [17]:
# Inspect the sampled English sentences (index labels are the original row ids).
pairs['english']

138416             i guess ill have to stay home
133377             tom gave some old coins to me
46697                       i once lived in rome
19545                           she died from tb
195975    the mountain peak is covered with snow
                           ...                  
77570                    tom picked up the coins
18261                           i swim regularly
42769                         toms very effusive
86615                   tom has gone for the day
102519                 ill lend this book to you
Name: english, Length: 20932, dtype: object

In [56]:
# Source (English) and target (German) sentence columns of the sampled pairs.
text_source, text_target = pairs['english'], pairs['german']

In [57]:
# Instantiate the spaCy pipelines from the imported model packages:
# English for the source side, German for the target side.
# NOTE(review): spaCy documents its *_sm packages as shipping without static
# word vectors, so `.vector` on the German side is presumably not a true word
# embedding - confirm, or consider a md/lg German model.
nlp_source = en_core_web_lg.load()
nlp_target = de_core_news_sm.load()

In [58]:
# Source side: fit a default TextVectorization layer on the English sentences
# and read back the vocabulary it learned.
Vectorizer_source = TextVectorization()
Vectorizer_source.adapt(text_source)
vocab_source = Vectorizer_source.get_vocabulary()

# Target side: same procedure on the German sentences.
# NOTE(review): the layers use default arguments. Per the Keras docs the
# default standardization lowercases and strips punctuation, which would
# presumably turn the START_/_END markers into 'start'/'end' in vocab_target -
# confirm that downstream code expects this.
Vectorizer_target = TextVectorization()
Vectorizer_target.adapt(text_target)
vocab_target = Vectorizer_target.get_vocabulary()

In [59]:
# Coerce every vocabulary entry to a plain Python string.
vocab_source = list(map(str, vocab_source))
vocab_target = list(map(str, vocab_target))

In [60]:
# Delete the first empty-string entry from each vocabulary in place.
# Raises ValueError if a list contains no '' entry (e.g. when the cell is run
# a second time).
del vocab_source[vocab_source.index('')]
del vocab_target[vocab_target.index('')]

In [63]:
# Vocabulary sizes (source, target) after the empty-string entry was removed.
len(vocab_source), len(vocab_target)

(6450, 10076)

In [None]:
def _embed_vocabulary(nlp, vocab, dim):
    """Return a ``(len(vocab), dim)`` float array of spaCy vectors.

    Row ``i`` holds ``nlp(vocab[i]).vector``; each entry is run through the
    pipeline on its own.
    """
    matrix = np.zeros((len(vocab), dim))
    for row, token in enumerate(vocab):
        matrix[row] = nlp(token).vector
    return matrix


# Source vocabulary: the vector width is probed from a sample word.
num_tokens_source = len(vocab_source)
embedding_dim_source = len(nlp_source('The').vector)
embedding_matrix_source = _embed_vocabulary(nlp_source, vocab_source, embedding_dim_source)

# Target vocabulary: same procedure with the target-language pipeline.
# NOTE(review): if nlp_target is a spaCy *_sm pipeline, it has no static word
# vectors according to the spaCy docs - confirm these rows are meaningful.
num_tokens_target = len(vocab_target)
embedding_dim_target = len(nlp_target('Der').vector)
embedding_matrix_target = _embed_vocabulary(nlp_target, vocab_target, embedding_dim_target)

- Run time of the embedding-matrix cell above: about 1 min 44 s.

In [71]:
# Add a row of zeros (for index 0) on top of embedding_matrix_source and
# embedding_matrix_target, so that row k + 1 holds the vector of vocabulary
# entry k.
#
# The cell is safe to re-run: a matrix that already has one more row than its
# vocabulary is left untouched. Stacking unconditionally would add another
# zero row on every execution and shift all token rows.


def _prepend_padding_row(matrix, vocab_size):
    """Return ``matrix`` with one all-zero row on top, unless already added.

    ``matrix`` is considered padded when it has ``vocab_size + 1`` rows.
    """
    if matrix.shape[0] == vocab_size + 1:
        return matrix
    return np.vstack((np.zeros((1, matrix.shape[1])), matrix))


embedding_matrix_source = _prepend_padding_row(embedding_matrix_source, len(vocab_source))
embedding_matrix_target = _prepend_padding_row(embedding_matrix_target, len(vocab_target))


In [64]:
# Word-level vocabularies of both languages, split on arbitrary whitespace.
all_en_words = {token for sentence in pairs['english'] for token in sentence.split()}
all_de_words = {token for sentence in pairs['german'] for token in sentence.split()}

# Longest source / target sentence, counted in single-space-separated pieces
# (a different split rule than the one used for the vocabularies above).
max_length_src = np.max([len(sentence.split(' ')) for sentence in pairs['english']])
max_length_tar = np.max([len(sentence.split(' ')) for sentence in pairs['german']])

# Alphabetical order fixes the word -> index assignment below.
input_words = sorted(all_en_words)
target_words = sorted(all_de_words)

# Vocabulary sizes. The extra slot accounts for index 0, which no word
# receives because the word indices below start at 1.
num_encoder_tokens = 1 + len(all_en_words)
num_decoder_tokens = 1 + len(all_de_words)

# Word -> index lookup for source and target (indices start at 1).
# NOTE(review): these indices are alphabetical, whereas the embedding matrices
# are built row by row from the TextVectorization vocabularies - confirm which
# index mapping the model actually consumes.
input_word_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_word_index = {word: idx for idx, word in enumerate(target_words, start=1)}

# Index -> word lookup, the inverse of the dictionaries above.
input_index_word = {idx: word for word, idx in input_word_index.items()}
target_index_word = {idx: word for word, idx in target_word_index.items()}

In [66]:
# Vocabulary sizes (source, target) from the whitespace-split word sets,
# each including the extra slot for index 0.
num_encoder_tokens, num_decoder_tokens

(6448, 10068)