In [1]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, Concatenate, LSTM, Embedding, Dense, MultiHeadAttention, LayerNormalization, Dropout
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.initializers import Constant
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time

In [2]:
from model_components import preprocess_sentence, get_angles, positional_encoding, create_padding_mask, create_look_ahead_mask, \
                                    FullyConnected, EncoderLayer, Encoder, DecoderLayer, Decoder, Transformer, CustomSchedule, \
                                        create_train_tokenizer, load_tokenizer

In [3]:
from tokenizers import Tokenizer, ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

<h3> Load and preprocess European Parliament data </h3>

In [4]:
# load doc into memory
def load_doc(filename):
	"""Read an entire UTF-8 text file and return its contents as one string.

	Args:
		filename: Path of the text file to read.

	Returns:
		The complete file contents as a ``str``.

	Raises:
		OSError: If the file cannot be opened (e.g. it does not exist).
		UnicodeDecodeError: If the file is not valid UTF-8.
	"""
	# The context manager closes the handle even when read() raises,
	# which the manual open()/close() pair did not guarantee.
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

# split a loaded document into sentences
def to_sentences(doc):
	"""Split a document string into a list with one entry per line.

	Leading and trailing whitespace of the whole document is removed first,
	then the text is split on newline characters. Blank lines inside the
	document are kept as empty strings.
	"""
	trimmed = doc.strip()
	lines = trimmed.split('\n')
	return lines

In [5]:
# Paths of the English and German halves of the Europarl de-en files.
filename_en = 'europarl_de-en/europarl-v7.de-en.en'
filename_de = 'europarl_de-en/europarl-v7.de-en.de'

# Read each file fully into memory, then split it into one list entry per line.
text_en, text_de = load_doc(filename_en), load_doc(filename_de)
doc_en, doc_de = to_sentences(text_en), to_sentences(text_de)

In [6]:
# Pair entry i of the German list with entry i of the English list.
# zip() stops at the shorter list, so surplus entries on one side are dropped silently.
both_languages = zip(doc_de, doc_en)
df_euro_parl = pd.DataFrame(data=both_languages, columns=['german', 'english'])

In [7]:
# Replace empty strings with NaN so that blank lines can be removed with dropna().
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place form
# emits a FutureWarning in recent pandas and leaves the DataFrame unchanged
# once Copy-on-Write is active.
df_euro_parl['german'] = df_euro_parl['german'].replace('', np.nan)
df_euro_parl['english'] = df_euro_parl['english'].replace('', np.nan)

In [8]:
# Number of missing (NaN) entries per language column, shown as (german, english).
tuple(df_euro_parl[column].isna().sum() for column in ('german', 'english'))

(2923, 8366)

In [11]:
# Keep only the rows in which both the German and the English entry are present.
df_euro_parl = df_euro_parl.dropna(axis=0, how='any')

<h3> Load and preprocess ManyThings data </h3>

In [12]:
# Read the tab-separated ManyThings file, discard its third column ('attr'),
# and rename the remaining columns to match df_euro_parl.
df_ManyThings = (
    pd.read_csv('deu-eng/deu.txt', sep='\t', names=['eng', 'deu', 'attr'])
    .drop(columns='attr')
    .rename(columns={'eng': 'english', 'deu': 'german'})
)

In [13]:
# preprocess_sentence is not applied to this frame on its own; it is applied to
# the combined frame after the two data sets are concatenated.

# Put the columns in the same order as df_euro_parl: German first, then English.
df_ManyThings = df_ManyThings.loc[:, ['german', 'english']]

In [14]:
# Replace empty strings with NaN so that they show up in the isna() check below.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place form
# emits a FutureWarning in recent pandas and leaves the DataFrame unchanged
# once Copy-on-Write is active.
df_ManyThings['german'] = df_ManyThings['german'].replace('', np.nan)
df_ManyThings['english'] = df_ManyThings['english'].replace('', np.nan)

In [15]:
# Number of missing (NaN) entries per language column, shown as (german, english).
tuple(df_ManyThings[column].isna().sum() for column in ('german', 'english'))

(0, 0)

<h3> Combine the two DataFrames </h3>

In [16]:
# Stack the Europarl rows on top of the ManyThings rows. Each frame keeps its
# own row labels, so labels are not unique in the combined frame.
df_complete = pd.concat([df_euro_parl, df_ManyThings])

In [17]:
# Show the last five rows of the combined frame.
df_complete.tail(n=5)

Unnamed: 0,german,english
251715,"Wenn jemand Fremdes dir sagt, dass du dich wie...",If someone who doesn't know your background sa...
251716,"Wenn jemand, der nicht weiß, woher man kommt, ...",If someone who doesn't know your background sa...
251717,"Es ist wohl unmöglich, einen vollkommen fehler...",It may be impossible to get a completely error...
251718,"Ich weiß wohl, dass das ausschließliche Beitra...",I know that adding sentences only in your nati...
251719,Ohne Zweifel findet sich auf dieser Welt zu je...,Doubtless there exists in this world precisely...


In [18]:
# Run preprocess_sentence over every sentence, German column first, then English.
for language_column in ('german', 'english'):
    df_complete[language_column] = df_complete[language_column].apply(preprocess_sentence)

In [19]:
# Print two preprocessed sentence pairs (row positions 498 and 499) as a spot check.
for row_position in (498, 499):
    german_text = df_complete.iloc[row_position, 0]
    english_text = df_complete.iloc[row_position, 1]
    print(german_text)
    print(english_text)

Lassen Sie mich das begruenden.
I would like to explain our thinking here.
Es geht uns erstens um eine ordnungsgemaesse Verwendung der Mittel aus den Struktur- und dem Kohaesionsfonds.
Firstly, we are concerned with the proper use of the Structural and Cohesion Funds.


<h3> Create English corpus and German corpus </h3>

- Create one corpus file each for English and German, on which the BPE tokenizers are trained

In [20]:
# Write one sentence per line into a plain-text corpus file per language.
# open() does not create missing directories, so make sure 'corpus/' exists.
os.makedirs('corpus', exist_ok=True)

# encoding='utf-8' is passed explicitly so the files do not depend on the
# platform's default encoding. Iterating over the column directly avoids
# building a Series for every row as iterrows() does; every line still ends
# with ' \n' exactly as before.
with open('corpus/en_corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(sentence + ' \n' for sentence in df_complete['english'])

with open('corpus/de_corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(sentence + ' \n' for sentence in df_complete['german'])
    

- run time: 1m 16 s

<h3> Create BPE tokenizers </h3>

In [23]:
# Train one tokenizer per language from its corpus file.
# NOTE(review): the corpus files are written to 'corpus/<name>' earlier in this
# notebook, but bare file names are passed here. Presumably
# create_train_tokenizer resolves the directory itself; confirm in model_components.
en_corpus_files = ['en_corpus.txt']
de_corpus_files = ['de_corpus.txt']
en_tokenizer = create_train_tokenizer(en_corpus_files)
de_tokenizer = create_train_tokenizer(de_corpus_files)




json file saved at: tokenizer_en_corpus.json



json file saved at: tokenizer_de_corpus.json


In [24]:
# Smoke test: reload both saved tokenizers from their JSON files, then encode
# one German sample sentence and display its tokens next to their ids.
en_tokenizer, en_word_index = load_tokenizer('tokenizer_en_corpus.json')
de_tokenizer, de_word_index = load_tokenizer('tokenizer_de_corpus.json')

# The sample is run through preprocess_sentence first, as the corpus sentences were.
sentence = preprocess_sentence(
    "In diesem      Sinne   , möchte ich Sie bitten, nur auf kurzerer Zeit eine Mail zu schicken."
)
output = de_tokenizer.encode(sentence)
(output.tokens, output.ids)


(['start_',
  'In',
  'Ġdiesem',
  'ĠSinne',
  'Ġ,',
  'Ġmoechte',
  'Ġich',
  'ĠSie',
  'Ġbitten',
  ',',
  'Ġnur',
  'Ġauf',
  'Ġkurz',
  'erer',
  'ĠZeit',
  'Ġeine',
  'ĠMa',
  'il',
  'Ġzu',
  'Ġschicken',
  '.',
  '_end'],
 [1,
  759,
  629,
  3084,
  4034,
  585,
  406,
  591,
  3197,
  14,
  631,
  359,
  1872,
  8016,
  838,
  390,
  1014,
  473,
  300,
  12795,
  16,
  2])

<h3> Save df_complete as a CSV file </h3>

In [25]:
# Persist the combined, preprocessed frame; the row labels are not written.
df_complete.to_csv(path_or_buf='df_complete.csv', index=False)

In [26]:
# Reload the frame from the CSV file that was just written.
df_complete = pd.read_csv(filepath_or_buffer='df_complete.csv')

In [27]:
# Count the missing values per column after the CSV round trip.
df_complete.isna().sum(axis=0)

german     2
english    2
dtype: int64

In [28]:
# The CSV round trip produced NaN values (see the counts above). By default
# pandas.read_csv parses empty fields and tokens such as 'NA', 'null' or 'nan'
# as missing, so sentences that were empty or equal to such a token after
# preprocessing are the likely cause - TODO confirm. The affected rows are dropped.
df_complete = df_complete.dropna(axis=0, how='any')

In [29]:
# Overwrite the CSV file with the frame from which the NaN rows were removed.
df_complete.to_csv(path_or_buf='df_complete.csv', index=False)

<h3> Create a version of the DataFrame with capped sentence length </h3>

In [30]:
# Work on an independent copy and add, per language, the number of pieces each
# sentence yields when split on single space characters.
df_complete_30 = df_complete.copy(deep=True)
for language in ('german', 'english'):
    df_complete_30[language + '_length'] = df_complete[language].apply(
        lambda text: len(text.split(' '))
    )

In [31]:
# Show the first five rows together with the new length columns.
df_complete_30.head(n=5)

Unnamed: 0,german,english,german_length,english_length
0,Wiederaufnahme der Sitzungsperiode,Resumption of the session,3,4
1,"Ich erklaere die am Freitag, dem 17. Dezember ...",I declare resumed the session of the European ...,29,38
2,"Wie Sie feststellen konnten, ist der gefuercht...","Although, as you will have seen, the dreaded '...",22,31
3,Im Parlament besteht der Wunsch nach einer Aus...,You have requested a debate on this subject in...,16,19
4,Heute moechte ich Sie bitten - das ist auch de...,"In the meantime, I should like to observe a mi...",33,40


In [32]:
# Keep only the pairs in which both the German and the English length are at most 30.
mask_30 = (df_complete_30[['german_length', 'english_length']] <= 30).all(axis=1)
df_complete_30 = df_complete_30.loc[mask_30]

In [35]:
# Row counts before and after applying the length cap.
df_complete.shape[0], df_complete_30.shape[0]

(2160638, 1562141)

In [36]:
# Save the length-capped frame (the two length columns are written as well);
# the row labels are not written.
df_complete_30.to_csv(path_or_buf='df_complete_30.csv', index=False)