In [64]:
import string

import numpy as np
import pandas as pd
import keras.utils as ku
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [65]:
# Load the LEGO dataset CSV into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks like a Colab upload location — confirm
# before running elsewhere.
lego = pd.read_csv('/content/lego_data_clean_translated.csv')
# Preview the first rows of the frame.
lego.head()


# Pull the `toy_name_en` column out of the frame; these names are the raw text
# that the later cells clean and tokenize.
toy_name_en = lego['toy_name_en'].values
print(toy_name_en)

['Himeji Castle' 'New York City' 'London' ... 'Easter Bunny House'
 'Mighty Micros: Supergirl™ vs. Brainiac™'
 'Mighty Micros: Batman™ vs. Harley Quinn™']


In [69]:
def clean_text(txt):
    """Normalise a piece of text for tokenization.

    The text is stripped of every character listed in ``string.punctuation``,
    lowercased, and finally reduced to plain ASCII (any character that cannot
    be represented in ASCII is silently dropped).

    Parameters
    ----------
    txt : str
        Raw text, e.g. a toy name.

    Returns
    -------
    str
        The cleaned, lowercase, ASCII-only text.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through bytes so that non-ASCII characters are discarded
    # instead of raising an error.
    return lowered.encode("utf8").decode("ascii", "ignore")

# Normalise every toy name, then preview the first ten cleaned names.
toy_name_en_clean = list(map(clean_text, toy_name_en))
toy_name_en_clean[:10]

['himeji castle',
 'new york city',
 'london',
 'paris',
 'great pyramid of giza',
 'taj mahal',
 'singapore',
 'statue of liberty',
 'the white house',
 'batcave shadow box']

In [70]:
# Notebook-level tokenizer shared with get_sequence_of_tokens, which fits it in place
# (so its vocabulary persists after that function returns).
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    """Fit the shared ``tokenizer`` on ``corpus`` and build n-gram prefix sequences.

    Every line is converted to its list of token ids and then expanded into
    all of its prefixes of length two or more. A line with tokens
    ``[a, b, c]`` contributes ``[a, b]`` and ``[a, b, c]``; a line with fewer
    than two tokens contributes nothing.

    Parameters
    ----------
    corpus : iterable of str
        Text lines to tokenize.

    Returns
    -------
    tuple
        ``(input_sequences, total_words)`` where ``input_sequences`` is the
        list of prefix sequences and ``total_words`` is the size of the
        fitted word index plus one.
    """
    # Fitting mutates the notebook-level tokenizer in place.
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        token_ids = tokenizer.texts_to_sequences([text])[0]
        # Prefix lengths run from 2 up to the full length of the line.
        prefixes.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))
    return prefixes, vocab_size
# Tokenize the cleaned toy names. No `corpus` variable is defined in this notebook;
# `toy_name_en_clean` is the cleaned text produced by the previous cell.
inp_sequences, total_words = get_sequence_of_tokens(toy_name_en_clean)
inp_sequences[:10]

[[697, 8],
 [67, 367],
 [67, 367, 27],
 [58, 698],
 [58, 698, 4],
 [58, 698, 4, 699],
 [700, 701],
 [703, 4],
 [703, 4, 369],
 [2, 170]]

In [71]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into model inputs and one-hot targets.

    All sequences are left-padded (``padding='pre'``) to the length of the
    longest one. The last token of each padded row becomes the label and the
    preceding tokens become the predictors.

    Parameters
    ----------
    input_sequences : list of list of int
        Token-id sequences, e.g. as returned by ``get_sequence_of_tokens``.
    num_classes : int, optional
        Width of the one-hot label vectors. When omitted, the notebook-level
        ``total_words`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    tuple
        ``(predictors, label, max_sequence_len)``: the padded rows without
        their last column, the one-hot encoded last column, and the padded
        row length.

    Raises
    ------
    ValueError
        If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # Backward-compatible fallback to the global set by the tokenization cell.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # Last column is the next-word target; everything before it is the context.
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

# Show the padded inputs, the one-hot targets and the common sequence length.
for item in (predictors, label, max_sequence_len):
    print(item)

[[  0   0   0 ...   0   0 697]
 [  0   0   0 ...   0   0  67]
 [  0   0   0 ...   0  67 367]
 ...
 [  0   0   0 ... 216 696  23]
 [  0   0   0 ... 696  23  19]
 [  0   0   0 ...  23  19 241]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
10
