In [12]:
from keras.models import Model, model_from_json, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint
import numpy as np
import keras
from tqdm import tqdm
import pandas as pd
import pickle

In [3]:
# Display the installed Keras version so the run can be reproduced later.
keras.__version__

'2.12.0'

In [9]:
# Load the train/test splits and keep only the columns used in this notebook.
NOUN_COLUMNS = ["base form", "concat form", "inflected form", "number", "case"]

df_train = pd.read_csv("../ukr_noun_inflection/data/train.csv")
df_train = df_train[NOUN_COLUMNS]

df_test = pd.read_csv("../ukr_noun_inflection/data/test.csv")
df_test = df_test[NOUN_COLUMNS]

In [10]:
# (rows, columns) of the train and test frames.
df_train.shape, df_test.shape

((19650, 5), (6592, 5))

In [11]:
# Preview the first ten training rows.
df_train.head(10)

Unnamed: 0,base form,concat form,inflected form,number,case
0,автофура,автофура1N,автофура,1,N
1,автофура,автофура1G,автофури,1,G
2,автофура,автофура1D,автофурі,1,D
3,автофура,автофура1A,автофуру,1,A
4,автофура,автофура1I,автофурою,1,I
5,автофура,автофура1L,автофурі,1,L
6,автофура,автофура1V,автофуро,1,V
7,автофура,автофура2N,автофури,2,N
8,автофура,автофура2G,автофур,2,G
9,автофура,автофура2D,автофурам,2,D


In [13]:
# Accumulators filled by the noun-processing loop:
#   input / output       -> source and target strings, one entry per training row
#   inp_chars / out_chars -> character vocabularies of the two sides
# NOTE: `input` shadows the Python builtin of the same name from here on.
input, output = [], []
inp_chars, out_chars = set(), set()

# Number of training examples; also the first dimension of the one-hot tensors.
nb_samples = len(df_train)

In [14]:
# Process nouns: collect (source, target) string pairs and both character vocabularies.
# The source is the "concat form" column; the target is the "inflected form"
# wrapped in '\t' (start marker) and '\n' (end marker).
# The two columns are iterated directly instead of calling `df_train.iloc[i]`,
# which builds a whole Series object for every single row.
source_forms = df_train["concat form"].iloc[:nb_samples]
target_forms = df_train["inflected form"].iloc[:nb_samples]

for inp, inflected in tqdm(zip(source_forms, target_forms), total=nb_samples):
    out = '\t' + inflected + '\n'
    input.append(inp)
    output.append(out)

    # A set ignores duplicates, so no explicit membership test is needed.
    inp_chars.update(inp)
    out_chars.update(out)


100%|██████████| 19650/19650 [00:00<00:00, 22680.22it/s]


In [16]:
# Freeze both vocabularies into sorted lists so each character gets a stable position.
inp_chars = sorted(inp_chars)
out_chars = sorted(out_chars)

# Show the input alphabet on one line.
' '.join(inp_chars)

"' - 1 2 A D G I L N V а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ь ю я є і ї ґ"

In [18]:
# Input vocabulary lookups, derived from the position of each character in `inp_chars`.
# index -> character
inp_index_to_char_dict = dict(enumerate(inp_chars))
# character -> index
inp_char_to_index_dict = {char: index for index, char in enumerate(inp_chars)}

# Output vocabulary lookups, derived from the position of each character in `out_chars`.
# index -> character
out_index_to_char_dict = dict(enumerate(out_chars))
# character -> index
out_char_to_index_dict = {char: index for index, char in enumerate(out_chars)}

In [21]:
# Timestep counts for the one-hot tensors: the longest training string on each
# side plus a margin of 5 characters.
input_lengths = df_train["concat form"].str.len()
output_lengths = df_train["inflected form"].str.len()

max_len_input = input_lengths.max() + 5
max_len_output = output_lengths.max() + 5

In [23]:
# Everything needed to encode inputs and decode outputs outside this notebook:
# both vocabularies, the padded sequence lengths and the four lookup tables.
data = {
    'inp_chars': inp_chars,
    'out_chars': out_chars,
    'max_len_input': max_len_input,
    'max_len_output': max_len_output,
    'inp_index_to_char_dict': inp_index_to_char_dict,
    'inp_char_to_index_dict': inp_char_to_index_dict,
    'out_index_to_char_dict': out_index_to_char_dict,
    'out_char_to_index_dict': out_char_to_index_dict,
}

In [24]:
# Pickle the vocabulary metadata for the inference model.
# The context manager closes the file as soon as the dump finishes; the previous
# bare `open(...)` left the handle open until garbage collection.
with open("data.p", "wb") as data_file:
    pickle.dump(data, data_file)

In [36]:
# One-hot tensors indexed as (sample, timestep, vocabulary index).
# Timesteps beyond the end of a string stay all-zero.
encoder_shape = (nb_samples, max_len_input, len(inp_chars))
decoder_shape = (nb_samples, max_len_output, len(out_chars))

tokenized_input = np.zeros(encoder_shape, dtype='float32')
tokenized_output = np.zeros(decoder_shape, dtype='float32')
target_data = np.zeros(decoder_shape, dtype='float32')

In [37]:
# Vectorize nouns: one-hot encode every source and target string.
for i in tqdm(range(nb_samples)):
    for step, ch in enumerate(input[i]):
        tokenized_input[i, step, inp_char_to_index_dict[ch]] = 1

    for step, ch in enumerate(output[i]):
        char_index = out_char_to_index_dict[ch]
        tokenized_output[i, step, char_index] = 1

        # The training target is the decoder input shifted one step to the left,
        # so the start character (step 0) never appears in it.
        if step > 0:
            target_data[i, step - 1, char_index] = 1

100%|██████████| 19650/19650 [00:00<00:00, 72869.73it/s]


In [38]:
# Encoder model: an LSTM consumes the one-hot input sequence. With
# return_state=True it returns its output followed by the hidden and cell
# states; only the two states are handed on to the decoder.
encoder_input = Input(shape=(None, len(inp_chars)))
encoder_LSTM = LSTM(256, return_state=True)
encoder_outputs, *encoder_states = encoder_LSTM(encoder_input)
encoder_h, encoder_c = encoder_states

2023-05-25 23:02:39.344131: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-25 23:02:39.345436: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-25 23:02:39.346222: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [39]:
# Decoder model: an LSTM initialised with the encoder states emits one vector per
# timestep, and a softmax Dense layer turns each vector into a distribution over
# the output characters.
decoder_input = Input(shape=(None, len(out_chars)))
decoder_LSTM = LSTM(256, return_sequences=True, return_state=True)
decoder_dense = Dense(len(out_chars), activation='softmax')

# The decoder's own final states are not needed while training.
decoder_sequence, _, _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
decoder_out = decoder_dense(decoder_sequence)

2023-05-25 23:02:41.354208: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-25 23:02:41.355276: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-25 23:02:41.356152: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [40]:
# Training model: takes the encoder input and the decoder input, and outputs the
# decoder's per-timestep softmax over output characters.
model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_out])

In [41]:
%%time
# `model` is already built in the previous cell, so it is not rebuilt here.

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# With metrics=['accuracy'] the validation metric is logged as `val_accuracy`
# (see the epoch log below), not `val_acc`. The old key made the filepath
# template un-formattable and gave the checkpoint nothing to monitor.
filepath="weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# NOTE(review): padded timesteps have all-zero targets, so the reported accuracy
# is presumably averaged over padding as well and looks far lower than the
# per-character accuracy really is - confirm before reading much into it.
model.fit(x=[tokenized_input, tokenized_output],
          y=target_data,
          batch_size=64,
          epochs=50,
          validation_split=0.2,
          # Checkpointing is left disabled as in the original run; uncomment to
          # save the best weights now that the monitored key is correct.
          #callbacks=callbacks_list,
          verbose=2)


Epoch 1/50


2023-05-25 23:02:45.510489: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-25 23:02:45.512138: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-25 23:02:45.513077: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

246/246 - 36s - loss: 1.2046 - accuracy: 0.0548 - val_loss: 1.1854 - val_accuracy: 0.0583 - 36s/epoch - 145ms/step
Epoch 2/50
246/246 - 33s - loss: 1.1768 - accuracy: 0.0613 - val_loss: 1.1761 - val_accuracy: 0.0636 - 33s/epoch - 132ms/step
Epoch 3/50
246/246 - 30s - loss: 1.1609 - accuracy: 0.0656 - val_loss: 1.1528 - val_accuracy: 0.0705 - 30s/epoch - 123ms/step
Epoch 4/50
246/246 - 30s - loss: 1.1430 - accuracy: 0.0711 - val_loss: 1.1729 - val_accuracy: 0.0618 - 30s/epoch - 122ms/step
Epoch 5/50
246/246 - 32s - loss: 1.1180 - accuracy: 0.0789 - val_loss: 1.1494 - val_accuracy: 0.0686 - 32s/epoch - 129ms/step
Epoch 6/50
246/246 - 34s - loss: 1.0955 - accuracy: 0.0864 - val_loss: 1.0979 - val_accuracy: 0.0887 - 34s/epoch - 140ms/step
Epoch 7/50
246/246 - 22s - loss: 1.0728 - accuracy: 0.0942 - val_loss: 1.0920 - val_accuracy: 0.0917 - 22s/epoch - 90ms/step
Epoch 8/50
246/246 - 27s - loss: 1.0433 - accuracy: 0.1032 - val_loss: 1.0593 - val_accuracy: 0.0979 - 27s/epoch - 111ms/step
Epoc

<keras.callbacks.History at 0x7f30f8388790>

In [42]:
def decode_seq(inp_seq, encoder_model_inf, decoder_model_inf):
    """Greedily decode one encoded input sequence, one character at a time.

    Args:
        inp_seq: one-hot encoded input passed straight to ``encoder_model_inf.predict``.
        encoder_model_inf: model whose ``predict`` returns the list of initial
            decoder states.
        decoder_model_inf: model whose ``predict(x=[target, *states])`` returns
            ``(output, h, c)``.

    Returns:
        ``(translated_sent, prob)``: the decoded string (including the final
        ``'\\n'`` when one was produced) and the geometric mean of the
        probabilities of the chosen characters.

    Reads the notebook-level ``out_chars``, ``out_char_to_index_dict``,
    ``out_index_to_char_dict`` and ``max_len_output``. Prints every chosen
    character together with its probability.
    """

    def one_hot(char_index):
        # Single-timestep decoder input with exactly one active character.
        vector = np.zeros((1, 1, len(out_chars)))
        vector[0, 0, char_index] = 1
        return vector

    # The encoder supplies the initial decoder states; decoding starts from '\t'.
    states_val = encoder_model_inf.predict(inp_seq)
    target_seq = one_hot(out_char_to_index_dict['\t'])

    translated_sent = ''
    prob = 1.0
    while True:
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

        # Greedy choice: take the most probable character of the last timestep.
        step_probs = decoder_out[0, -1, :]
        best_index = np.argmax(step_probs)
        best_prob = step_probs[best_index]
        prob *= best_prob

        next_char = out_index_to_char_dict[best_index]
        translated_sent += next_char
        print('{} == {}'.format(next_char, best_prob))

        # Stop on the end marker or once the output grows past the length limit.
        if next_char == '\n' or len(translated_sent) > max_len_output:
            break

        # Feed the chosen character and the updated states into the next step.
        target_seq = one_hot(best_index)
        states_val = [decoder_h, decoder_c]

    # Geometric mean, so strings of different lengths are comparable.
    prob = prob ** (1 / len(translated_sent))
    return translated_sent, prob


In [44]:
# Inference models for testing
# Encoder inference model: maps an input sequence to the encoder's [h, c] states.
encoder_model_inf = Model(encoder_input, encoder_states)

In [49]:
# Decoder inference model
# At inference time the decoder is called repeatedly, so its LSTM state is passed
# in through two explicit inputs instead of coming from the encoder graph.
# The state width is read from the layer rather than repeating the literal 256.
decoder_state_size = decoder_LSTM.units
decoder_state_input_h = Input(shape=(decoder_state_size,))
decoder_state_input_c = Input(shape=(decoder_state_size,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

# The output tensor gets its own name: rebinding `decoder_out` here would
# overwrite the training graph's output, and re-running the cells that build
# `model` afterwards would then fail with a disconnected graph.
decoder_out_inf, decoder_h, decoder_c = decoder_LSTM(decoder_input,
                                                     initial_state=decoder_input_states)

decoder_states = [decoder_h, decoder_c]

decoder_out_inf = decoder_dense(decoder_out_inf)

# Inputs: decoder input + previous states. Outputs: character distribution + new states.
decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out_inf] + decoder_states)

2023-05-25 23:30:06.297794: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-25 23:30:06.298692: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-25 23:30:06.299402: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [53]:
def tokenize(seq):
    """One-hot encode a single input string for the encoder.

    Returns a float32 array of shape ``(1, max_len_input, len(inp_chars))`` in
    which row ``k`` has a 1 at the index of the ``k``-th character of ``seq``;
    rows past the end of ``seq`` stay all-zero.

    Reads the notebook-level ``max_len_input``, ``inp_chars`` and
    ``inp_char_to_index_dict``.
    """
    encoded_shape = (1, max_len_input, len(inp_chars))
    encoded = np.zeros(encoded_shape, dtype='float32')

    for position, symbol in enumerate(seq):
        symbol_index = inp_char_to_index_dict[symbol]
        encoded[0, position, symbol_index] = 1

    return encoded

In [56]:
# Smoke test on a single noun; the input uses the same word + number + case
# layout as the "concat form" column.
inp = 'кіт1G'
inp_seq = tokenize(inp)
translated_sent, prob = decode_seq(inp_seq, encoder_model_inf, decoder_model_inf)
# Geometric-mean probability of the decoded characters.
prob

і == 0.8185333609580994
к == 0.6298231482505798
к == 0.13716712594032288
і == 0.06030400097370148

 == 0.06717141717672348


0.19561724404877676