Kood põhineb järgmisel juhendil: https://data-flair.training/blogs/python-based-project-image-caption-generator-cnn/

In [3]:
import os

import cv2
import pickle
import numpy as np


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

Using TensorFlow backend.


In [4]:
# Directory holding the numbered "<i>.png" / "<i>.ly" sample files.
BASEDIR = "files_without_pdf"
# Number of samples; files are indexed 0 .. FILES_NUMBER - 1.
FILES_NUMBER = 5000

# Note name -> integer key, from c' up to c''.
NOTES_AND_KEYS = {
    "c'": 1,
    "d'": 2,
    "e'": 3,
    "f'": 4,
    "g'": 5,
    "a'": 6,
    "b'": 7,
    "c''": 8,
}
# NOTE(review): START/END look like ids reserved for the sequence markers,
# but none of the visible cells reference them - confirm before relying on them.
START = 0
END = 9

# Size (in pixels) every image is resized to before being flattened.
IMG_WIDTH = 142
IMG_HEIGHT = 50

In [39]:
def create_img_pickle():
    """Load every sample image, flatten it and cache the result in ``images.p``.

    For each ``i`` in ``range(FILES_NUMBER)`` the file ``<BASEDIR>/<i>.png`` is
    read, converted to grayscale, resized to ``IMG_WIDTH`` x ``IMG_HEIGHT`` and
    flattened to a 1-D array of ``IMG_WIDTH * IMG_HEIGHT`` values.

    Returns:
        dict: sample index -> flattened grayscale pixel array. The same dict
        is also pickled to ``images.p`` in the working directory.

    Raises:
        FileNotFoundError: if an image file is missing or cannot be decoded.
    """
    images = {}
    for i in range(FILES_NUMBER):
        image_file = str(i) + '.png'
        image_path = os.path.join(BASEDIR, image_file)
        img = cv2.imread(image_path)
        # cv2.imread reports failure by returning None instead of raising, which
        # would otherwise surface as an opaque OpenCV error inside cvtColor.
        if img is None:
            raise FileNotFoundError("Could not read image: " + image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # cv2.resize takes the target size as (width, height)
        img = cv2.resize(img, (int(IMG_WIDTH), int(IMG_HEIGHT)))
        # flatten the 2-D image into a single feature vector
        img = img.reshape(-1)
        images[i] = img
    with open('images.p', 'wb') as f:
        pickle.dump(images, f)
    return images

In [6]:
def create_notes_pickle():
    """Collect the note line of every LilyPond file and cache it in ``notes.p``.

    For each ``i`` in ``range(FILES_NUMBER)`` the third line of
    ``<BASEDIR>/<i>.ly`` is stripped and wrapped as ``'<start> ... <end>'``.

    Returns:
        dict: sample index -> tagged note string. The same dict is also
        pickled to ``notes.p`` in the working directory.
    """
    tagged_by_index = {}
    for idx in range(FILES_NUMBER):
        source_path = os.path.join(BASEDIR, str(idx) + '.ly')
        with open(source_path, 'r') as source:
            # the notes live on the third line of each file
            note_line = source.readlines()[2]
        tagged_by_index[idx] = '<start> ' + note_line.strip() + ' <end>'

    with open('notes.p', 'wb') as cache:
        pickle.dump(tagged_by_index, cache)
    return tagged_by_index

In [7]:
def create_tokenizer(notes_dict):
    """Fit a Keras ``Tokenizer`` on the note strings and cache it in ``tokenizer.p``.

    Args:
        notes_dict: mapping whose values are the tagged note strings.

    Returns:
        The fitted ``Tokenizer`` (also pickled to ``tokenizer.p``).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(notes_dict.values())
    with open('tokenizer.p', 'wb') as cache:
        pickle.dump(fitted, cache)
    return fitted

In [40]:
# Build the datasets. Each call also writes its result to disk
# (images.p, notes.p, tokenizer.p), which the inference cell reloads later.
img = create_img_pickle()
note = create_notes_pickle()
tok = create_tokenizer(note)

In [9]:
# One more than the number of known tokens: index 0 is left for padding
# (the Embedding layer in define_model is built with mask_zero=True).
vocab_size = len(tok.word_index) + 1
vocab_size

11

In [10]:
def compute_max_length(notes_list):
    """Return the largest whitespace-separated token count among the strings.

    Args:
        notes_list: iterable of note strings.

    Returns:
        int: length (in tokens) of the longest string.

    Raises:
        ValueError: if ``notes_list`` is empty.
    """
    return max(len(d.split()) for d in notes_list)


# The helper deliberately has a different name than the result: previously the
# integer was assigned over the function itself, so re-running this cell failed
# with "TypeError: 'int' object is not callable".
max_length = compute_max_length(note.values())
max_length

10

In [11]:
def create_sequences(tokenizer, max_length, notes, image, vocab_size):
    """Expand one (image, note string) pair into next-token training samples.

    The note string is encoded with ``tokenizer``; every proper prefix of the
    encoded sequence becomes one sample whose target is the token that follows.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_length: length each input prefix is padded to.
        notes: the tagged note string of a single sample.
        image: the image features paired with ``notes`` (repeated per sample).
        vocab_size: number of classes for the one-hot encoded target.

    Returns:
        tuple: ``(images, padded_prefixes, one_hot_targets)`` as numpy arrays,
        one row per prefix.
    """
    encoded = tokenizer.texts_to_sequences([notes])[0]
    image_rows, prefix_rows, target_rows = [], [], []
    for split_at, next_token in enumerate(encoded[1:], start=1):
        # everything before position `split_at` is the input, the token at it is the label
        prefix = pad_sequences([encoded[:split_at]], maxlen=max_length)[0]
        target = to_categorical([next_token], num_classes=vocab_size)[0]
        image_rows.append(image)
        prefix_rows.append(prefix)
        target_rows.append(target)
    return np.array(image_rows), np.array(prefix_rows), np.array(target_rows)

In [51]:
def data_generator(notes, images, tokenizer, max_length, vocab_size):
    """Endlessly yield one training batch per sample for ``fit_generator``.

    Args:
        notes: dict mapping sample key -> tagged note string.
        images: dict mapping the same keys -> image features.
        tokenizer: fitted tokenizer passed through to ``create_sequences``.
        max_length: padded length of the input sequences.
        vocab_size: number of classes for the one-hot targets.

    Yields:
        tuple: ``([image_batch, sequence_batch], target_batch)`` built by
        ``create_sequences`` from a single sample.
    """
    # Loop forever: Keras decides when an epoch ends via steps_per_epoch.
    while True:
        for key, note in notes.items():
            image = images[key]
            input_image, input_sequence, output_word = create_sequences(
                tokenizer, max_length, note, image, vocab_size)
            # Keras documents the generator output as a tuple (inputs, targets);
            # a plain list is rejected by newer Keras / tf.keras releases.
            yield ([input_image, input_sequence], output_word)

In [52]:
# Sanity check: pull the first batch from a fresh generator and show the
# shapes of the image inputs, sequence inputs and one-hot targets.
[a,b],c = next(data_generator(note, img, tok, max_length, vocab_size))
a.shape, b.shape, c.shape

((3, 7100), (3, 10), (3, 11))

In [43]:
from keras.utils import plot_model

def define_model(vocab_size, max_length, image_size=None):
    """Build and compile the two-input next-token model.

    One branch takes the flattened image, the other the padded token prefix;
    both are reduced to 256 units, summed, and fed to a softmax over the
    vocabulary.

    Args:
        vocab_size: size of the output softmax and of the embedding table.
        max_length: length of the padded input token sequences.
        image_size: length of the flattened image vector. Defaults to
            ``IMG_WIDTH * IMG_HEIGHT`` (7100 with the current settings).

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy, Adam).
    """
    if image_size is None:
        image_size = IMG_WIDTH * IMG_HEIGHT
    # Image branch: flattened raw pixels projected down to 256 units.
    # NOTE(review): the cached pixels are fed in unscaled as far as the visible
    # cells show - consider normalising them; confirm against training results.
    inputs1 = Input(shape=(image_size,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # Sequence branch: embedded token prefix summarised by an LSTM.
    inputs2 = Input(shape=(max_length,))
    # mask_zero=True makes the padding index 0 be ignored downstream
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # Merge both branches (element-wise sum) and decode to a token distribution.
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together: [image, seq] -> [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints by itself and returns None, so it must not be wrapped in print()
    model.summary()
    return model

In [53]:
# Build the model for the vocabulary size and sequence length computed above.
model = define_model(vocab_size, max_length)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 10)           0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 7100)         0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 10, 256)      2816        input_12[0][0]                   
__________________________________________________________________________________________________
dropout_11 (Dropout)            (None, 7100)         0           input_11[0][0]                   
__________________________________________________________________________________________________
dropout_12

In [54]:
epochs = 100
# One generator step is one sample, so an epoch visits every file once.
steps = FILES_NUMBER

# Make sure the checkpoint directory exists up front, so a missing folder
# cannot make model.save fail after a full epoch of training.
os.makedirs("models", exist_ok=True)

# Train one epoch at a time so that a checkpoint is written after every epoch.
for i in range(epochs):
    generator = data_generator(note, img, tok, max_length, vocab_size)
    model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")


Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
 108/5000 [..............................] - ETA: 6:21 - loss: 14.7734

KeyboardInterrupt: 

In [71]:
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Returns:
        The first matching word, or ``None`` when no word has that index.
    """
    matches = (
        token
        for token, token_id in tokenizer.word_index.items()
        if token_id == integer
    )
    return next(matches, None)

def generate_desc(model, tokenizer, photo, max_length):
    """Greedily decode a note sequence for ``photo``.

    Starting from ``'<start>'``, the text so far is encoded and padded, the
    model predicts the next token, and the most probable word is appended.
    Decoding stops after ``max_length`` steps, when the predicted index has no
    word, or once the word ``'end'`` has been appended.

    Args:
        model: trained model taking ``[photo, sequence]`` as input.
        tokenizer: tokenizer used to encode text and to map indices to words.
        photo: image features in the shape the model expects.
        max_length: padded sequence length and maximum number of decoding steps.

    Returns:
        str: the generated text, beginning with ``'<start>'``.
    """
    tokens = ['<start>']
    for _ in range(max_length):
        text_so_far = ' '.join(tokens)
        encoded = tokenizer.texts_to_sequences([text_so_far])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        # greedy choice: take the single most probable next token
        best_id = np.argmax(model.predict([photo, padded], verbose=0))
        word = word_for_id(best_id, tokenizer)
        if word is None:
            break
        tokens.append(word)
        if word == 'end':
            break
    return ' '.join(tokens)

In [72]:
# Must match the sequence length the model was trained with
# (the max_length cell above displayed 10).
max_length = 10

# NOTE: pickle.load can execute arbitrary code - only load files that were
# produced by this notebook. Context managers make sure the handles are closed.
with open("tokenizer.p", "rb") as fh:
    tokenizer = pickle.load(fh)
with open("images.p", "rb") as fh:
    photos = pickle.load(fh)

model = load_model('models/model_9.h5')

# Wrap the first image in a batch of size one for model.predict.
photo = np.array([photos[0]])
description = generate_desc(model, tokenizer, photo, max_length)
print("Ennustus: " + description)

Ennustus: <start> g' g' g' g' g' g' g' g' g' g'
