## Language Translator

In [1]:
import string
import re
from numpy import array, argmax, random, take, delete
import pandas as pd
import nltk
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', 200)

In [24]:
import nltk
from nltk.tokenize import word_tokenize

From `nltk` we can download translated sentences between different languages. You can see the example between **English and French** below but feel free to try different combination as well.

In [2]:
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to /Users/jandas/nltk_data...
[nltk_data]   Package comtrans is already up-to-date!


True

In [3]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [4]:
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

In [5]:
comtrans.aligned_sents('alignment-en-fr.txt')

[AlignedSent(['Resumption', 'of', 'the', 'session'], ['Reprise', 'de', 'la', 'session'], Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])), AlignedSent(['I', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'European', 'Parliament', 'adjourned', 'on', 'Friday', '17', 'December', '1999', ',', 'and', 'I', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period', '.'], ['Je', 'déclare', 'reprise', 'la', 'session', 'du', 'Parlement', 'européen', 'qui', 'avait', 'été', 'interrompue', 'le', 'vendredi', '17', 'décembre', 'dernier', 'et', 'je', 'vous', 'renouvelle', 'tous', 'mes', 'vux', 'en', 'espérant', 'que', 'vous', 'avez', 'passé', 'de', 'bonnes', 'vacances', '.'], Alignment([(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 5), (7, 7), (8, 6), (9, 8), (9, 9), (9, 10), (9, 11), (10, 12), (11, 13), (12, 14), (13, 15), (14, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 20), (2

In [6]:
sents = comtrans.aligned_sents("alignment-en-fr.txt")

In [7]:
len(sents)

33334

In [26]:
en_tokens = [lst.words for lst in sents]
fr_tokens = [lst.mots for lst in sents]

In [38]:
fr_tokens[1]

['Je',
 'déclare',
 'reprise',
 'la',
 'session',
 'du',
 'Parlement',
 'européen',
 'qui',
 'avait',
 'été',
 'interrompue',
 'le',
 'vendredi',
 '17',
 'décembre',
 'dernier',
 'et',
 'je',
 'vous',
 'renouvelle',
 'tous',
 'mes',
 'vux',
 'en',
 'espérant',
 'que',
 'vous',
 'avez',
 'passé',
 'de',
 'bonnes',
 'vacances',
 '.']

In [36]:
# remove all tokens that are not alphabetic
#fr_words = [word for lst in fr_tokens if word.isalpha()]
fr_words = []
for lst in fr_tokens:
    for word in lst:
        if word.isalpha():
            fr_words.append(word)
print(fr_words[:100])

['Reprise', 'de', 'la', 'session', 'Je', 'déclare', 'reprise', 'la', 'session', 'du', 'Parlement', 'européen', 'qui', 'avait', 'été', 'interrompue', 'le', 'vendredi', 'décembre', 'dernier', 'et', 'je', 'vous', 'renouvelle', 'tous', 'mes', 'vux', 'en', 'espérant', 'que', 'vous', 'avez', 'passé', 'de', 'bonnes', 'vacances', 'Vous', 'avez', 'souhaité', 'un', 'débat', 'à', 'ce', 'sujet', 'dans', 'les', 'prochains', 'jours', 'au', 'cours', 'de', 'cette', 'période', 'de', 'session', 'Je', 'vous', 'invite', 'à', 'vous', 'lever', 'pour', 'cette', 'minute', 'de', 'silence', 'Le', 'Parlement', 'debout', 'observe', 'une', 'minute', 'de', 'silence', 'Madame', 'la', 'Présidente', 'est', 'une', 'motion', 'de', 'procédure', 'Vous', 'avez', 'probablement', 'appris', 'par', 'la', 'presse', 'et', 'par', 'la', 'télévision', 'que', 'plusieurs', 'attentats', 'à', 'la', 'bombe', 'et']


In [44]:
# remove all tokens that are not alphabeticen_words = []
for lst in en_tokens:
    for word in lst:
        if word.isalpha():
            en_words.append(word)
print(en_words[:100])

['Resumption', 'of', 'the', 'session', 'I', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'European', 'Parliament', 'adjourned', 'on', 'Friday', 'December', 'and', 'I', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period', 'You', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days', 'during', 'this', 'Please', 'rise', 'then', 'for', 'this', 'minute', 's', 'silence', 'The', 'House', 'rose', 'and', 'observed', 'a', 'minute', 's', 'silence', 'Madam', 'President', 'on', 'a', 'point', 'of', 'order', 'You', 'will', 'be', 'aware', 'from', 'the', 'press', 'and', 'television', 'that', 'there', 'have', 'been', 'a', 'number', 'of', 'bomb', 'explosions']


In [40]:
len(en_words)

597117

In [41]:
len(fr_words)

616875

In [45]:
fr_tokens = [w.lower() for w in fr_words]

In [48]:
en_tokens = [w.lower() for w in en_words]

In [49]:
len(en_tokens)

1194234

In [None]:
maxlen = 25
t = Tokenizer(char_level=True)
t.fit_on_texts(names)
tokenized = t.texts_to_sequences(names)
padded_names = preprocessing.sequence.pad_sequences(tokenized, maxlen=maxlen)
print(padded_names.shape)