In [1]:
import pickle
import string

import numpy as np
import pandas as pd
import keras.utils as ku
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Embedding, Input, LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

2024-04-20 11:52:49.596878: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the cleaned, translated LEGO dataset (per the file name); the path is
# relative to the notebook's working directory.
lego = pd.read_csv('lego_data_clean_translated.csv')
# Not the last expression of the cell, so this preview is evaluated but not rendered.
lego.head()


# Extract the 'toy_name_en' column as a NumPy array; this is the raw text corpus
# that the cleaning step consumes.
toy_name_en = lego['toy_name_en'].values
print(toy_name_en)

['Himeji Castle' 'New York City' 'London' ... 'Easter Bunny House'
 'Mighty Micros: Supergirl™ vs. Brainiac™'
 'Mighty Micros: Batman™ vs. Harley Quinn™']


In [3]:
def clean_text(txt):
    """Normalise a piece of text for tokenization.

    Steps, in order: drop every character found in ``string.punctuation``,
    lower-case the remainder, then discard anything that is not ASCII
    (for example the trademark sign).

    Args:
        txt: The text to normalise.

    Returns:
        The lower-cased, punctuation-free, ASCII-only text.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through bytes so that non-ASCII characters are silently dropped.
    return lowered.encode("utf8").decode("ascii", "ignore")

# Normalise every set name with clean_text, then preview the first ten results.
toy_name_en_clean = [clean_text(x) for x in toy_name_en]
toy_name_en_clean[:10]

['himeji castle',
 'new york city',
 'london',
 'paris',
 'great pyramid of giza',
 'taj mahal',
 'singapore',
 'statue of liberty',
 'the white house',
 'batcave shadow box']

In [4]:
# Shared tokenizer instance: fitted inside get_sequence_of_tokens and later
# handed to generate_text for encoding seeds and decoding predictions.
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and expand each text into n-gram prefixes.

    A text with at least two tokens contributes its token-id prefixes of
    length 2, 3, ... up to its full length; a text with fewer than two tokens
    contributes nothing.

    Args:
        corpus: Collection of text strings. It is traversed twice: once to
            fit the tokenizer and once to encode each text.

    Returns:
        Tuple ``(input_sequences, total_words)``: the list of prefix lists and
        the size of the tokenizer's word index plus one.
    """
    # Learn the vocabulary on the module-level tokenizer (this mutates it).
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    # Encode each text on its own and collect every prefix of two or more tokens.
    prefixes = []
    for text in corpus:
        token_ids = tokenizer.texts_to_sequences([text])[0]
        prefixes.extend(token_ids[:stop] for stop in range(2, len(token_ids) + 1))
    return prefixes, vocab_size

# Build the n-gram training prefixes and the vocabulary size, then preview the first ten prefixes.
inp_sequences, total_words = get_sequence_of_tokens(toy_name_en_clean)
inp_sequences[:10]

[[697, 8],
 [67, 367],
 [67, 367, 27],
 [58, 698],
 [58, 698, 4],
 [58, 698, 4, 699],
 [700, 701],
 [703, 4],
 [703, 4, 369],
 [2, 170]]

In [5]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the token sequences and split them into model inputs and one-hot labels.

    All sequences are left-padded with zeros to the length of the longest one.
    The last token of each padded sequence becomes the label; the tokens
    before it become the predictor row.

    Args:
        input_sequences: Non-empty list of token-id lists.
        num_classes: Width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` variable is used, which keeps
            existing single-argument calls working.

    Returns:
        Tuple ``(predictors, label, max_sequence_len)`` where ``predictors``
        has ``max_sequence_len - 1`` columns, ``label`` is the one-hot encoded
        last token of every sequence, and ``max_sequence_len`` is the padded length.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences must contain at least one sequence")
    if num_classes is None:
        # Backward-compatible fallback to the global defined by the tokenization cell.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra np.array() copy is needed.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Turn the n-gram prefixes into model inputs (all tokens but the last) and
# one-hot targets (the last token of each prefix).
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

# Quick look at the resulting arrays and the padded sequence length.
print(predictors)
print(label)
print(max_sequence_len)

[[  0   0   0 ...   0   0 697]
 [  0   0   0 ...   0   0  67]
 [  0   0   0 ...   0  67 367]
 ...
 [  0   0   0 ... 216 696  23]
 [  0   0   0 ... 696  23  19]
 [  0   0   0 ...  23  19 241]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
10


In [6]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    The stack is Input -> Embedding -> LSTM -> Dropout -> Dense(softmax), compiled
    with categorical cross-entropy and the Adam optimizer.

    Args:
        max_sequence_len: Length of the padded training sequences including
            the label token; the model consumes ``max_sequence_len - 1`` tokens.
        total_words: Vocabulary size; used as the embedding input dimension
            and as the number of softmax outputs.
        embedding_dim: Size of each embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Fraction of LSTM outputs dropped during training.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``max_sequence_len`` is smaller than 2, which would
            leave no input tokens.
    """
    input_len = max_sequence_len - 1  # Length of input sequences
    if input_len < 1:
        raise ValueError("max_sequence_len must be at least 2, got %r" % (max_sequence_len,))

    model = Sequential()
    # Declare the input shape with an explicit Input layer rather than the
    # deprecated Embedding(input_length=...) argument; this also lets
    # model.summary() report shapes before training.
    model.add(Input(shape=(input_len,)))
    model.add(Embedding(total_words, embedding_dim))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))  # Dropout for regularization
    model.add(Dense(total_words, activation='softmax'))  # One probability per vocabulary index
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Instantiate the network for this corpus and print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()



In [7]:
# Train on all prefixes for 250 epochs; no validation data or callbacks are
# passed, so training always runs the full epoch count.
model.fit(predictors, label, epochs=250, verbose=1)

Epoch 1/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 7.1654
Epoch 2/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 6.4466
Epoch 3/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 6.3881
Epoch 4/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 6.3217
Epoch 5/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 6.2162
Epoch 6/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 6.1489
Epoch 7/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 6.1497
Epoch 8/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 6.0774
Epoch 9/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 5.9338
Epoch 10/250
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 5.7807

<keras.src.callbacks.history.History at 0x132b3fe90>

In [13]:
# NOTE(review): joblib is imported here but not used anywhere in this cell.
import joblib
import tensorflow as tf 

# Save a TensorFlow/Keras model
# NOTE(review): only the model is persisted; the fitted tokenizer that
# generate_text also needs is not saved by this cell.
model.save('nlp_model.h5')  # Written as a single file; the .h5 suffix selects the HDF5 format

# Load the model back
loaded_model = tf.keras.models.load_model('nlp_model.h5')



NameError: name 'tf' is not defined

In [None]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import random

def generate_text(seed_text, next_words, model, max_sequence_len, tokenizer):
    """Extend ``seed_text`` with greedily predicted words and return it title-cased.

    A random vocabulary word is always appended to the seed before prediction
    starts (and an empty seed is first replaced by a random word), so two
    calls with the same seed can produce different results. Each following
    word is the model's highest-probability prediction for the text so far.

    Args:
        seed_text: Starting text; may be the empty string.
        next_words: Number of words to predict and append.
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            scores per input row, indexed by tokenizer word index.
        max_sequence_len: Padded sequence length used in training; the model
            input is ``max_sequence_len - 1`` tokens long.
        tokenizer: Fitted tokenizer providing ``word_index``, ``index_word``
            and ``texts_to_sequences``.

    Returns:
        The seed plus the random word and the predicted words, joined by
        single spaces and passed through ``str.title()``.
    """
    # Build the candidate list once instead of once per random pick.
    vocabulary = list(tokenizer.word_index.keys())

    # If the seed_text is empty, randomly pick a word from the tokenizer's word index
    if seed_text == "":
        seed_text = random.choice(vocabulary)

    # Always add one random word; presumably this varies the output, since the
    # argmax loop below is deterministic for a given text.
    seed_text += " " + random.choice(vocabulary)

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        probabilities = model.predict(token_list, verbose=0)
        # The batch holds a single row, so reduce the argmax result to a plain int
        # rather than comparing ints against a one-element array.
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        # Reverse lookup through the tokenizer's index -> word map; an index with
        # no word (such as the padding index 0) contributes an empty string.
        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
# Example usage with an empty string as seed_text
# Example usage: seed the generator with "City:" and ask for three predicted words
seedtext = "City:"
setname = generate_text(seedtext, 3, model, max_sequence_len, tokenizer)
print(setname)

In [None]:
# Disabled cell: everything below is a single triple-quoted string literal, so
# none of it executes; the cell only evaluates to that string.
# NOTE(review): before re-enabling, confirm the `size` value against the sizes
# the OpenAI images API accepts for dall-e-3, read the API key from the
# environment instead of assigning it in the notebook, and move the install
# step into its own pinned `%pip install` cell.
'''
!pip install openai
from openai import OpenAI

api_key = ''
client = OpenAI(api_key=api_key)

response = client.images.generate(
  model="dall-e-3",
  prompt="Generate image of a Lego set with the box in the background with the title: " + setname + " DO NOT GENERATE ANY COPYRIGHT CONTENT",
  size="1920x1080",
  quality="standard",
  n=1,
)

image_url = response.data[0].url
print(image_url)
'''