## Звуко-слово. Прилагательное, 9 букв, именительный падеж, единственное число

In [1]:
import getpass
import os
import re

import MySQLdb
import numpy as np
import tensorflow as tf
from keras import optimizers
from keras.backend.tensorflow_backend import set_session
from keras.layers import LSTM, Bidirectional, Dropout, Dense, TimeDistributed, Input, Embedding, Flatten, Add, Reshape, Concatenate, Lambda, Activation
from keras.metrics import categorical_accuracy
from keras.models import Model
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split

# Hyper-parameter constants (not referenced by the cells visible in this notebook).
DROPOUT = 0.1
UNITS = 128

# Alphabet of written words: space, hyphen and the 33 lowercase Russian letters.
WORD_SET = ' -абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# Alphabet of sounds. A sound is one character optionally followed by an apostrophe.
SOUND_WORD_SET = ' аеёиоуыэюяа\'е\'ё\'и\'о\'у\'ы\'э\'ю\'я\'бб\'вв\'гг\'дд\'жзз\'йкк\'лл\'мм\'нн\'пп\'рр\'сс\'тт\'фф\'хх\'цчшщ'
WORD_MAX_LENGTH = 30
# Apostrophe-marked vowels. "ё'" is included because SOUND_WORD_SET defines it as a sound;
# without it, helpers testing membership in this tuple silently skip that vowel.
STRESS_SOUDNS = ('а\'','е\'','ё\'','и\'','о\'','у\'','ы\'','э\'','ю\'','я\'')
SOUND_WORD_MAX_LENGTH = 30
# Number of phonosemantic values stored per word.
PHONOSEM_VALUES_LEN = 25

# Length (in letters) of the words this notebook works with.
WORD_LENGTH = 9

# Character -> index and sound -> index lookup tables.
WORD_DIC = {k: v for v, k in enumerate(WORD_SET)}
# Raw string: same pattern as before, without relying on the invalid "\s" string escape.
SOUND_WORD_DIC = {k: i for i, k in enumerate(re.findall(r"[\sа-яё]'?", SOUND_WORD_SET))}
VOCAB_LEN = len(WORD_DIC)
SOUND_VOCAB_LEN = len(SOUND_WORD_DIC)

Using TensorFlow backend.


In [2]:
# Create the TensorFlow session configuration and register the session with Keras.
config = tf.ConfigProto()
# GPU memory options: a 0.9 per-process memory fraction together with allow_growth.
# NOTE(review): both options are set at once; confirm against the TF 1.x docs that this
# combination gives the intended allocation behaviour.
config.gpu_options.per_process_gpu_memory_fraction = 0.9
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

## Подготовка данных

In [3]:
# Load sound words, stress positions and phonosemantic values from MySQL.
#
# The password is taken from the WORDS_DB_PASSWORD environment variable (or prompted for),
# so that no credential is stored in the notebook.
db_password = os.environ.get('WORDS_DB_PASSWORD') or getpass.getpass('MySQL password for words_dataset: ')
db_conn = MySQLdb.Connect(host='localhost', user='root', passwd=db_password, db='words_dataset', use_unicode=True,
                          charset="utf8")
x_sound_words = []
x_stresses = []
x_phonosem_values = []

# Same query as before; the word length is now a bound parameter instead of being
# formatted into the SQL text.
words_query = (
    "select distinct rnd_id, sound_word, primary_stress, type_id, value "
    "from words_rnd "
    "inner join phonosemantics on phonosemantics.word_id = words_rnd.id "
    "inner join morphemes on morphemes.word_id = words_rnd.id "
    "where secondary_stress IS NULL and char_length(word)=%s "
    "and not convert(word using 'CP1251') regexp '[ьъ]' "
    "and morphemes.pos = 'ADJF' and morphemes.case = 'nomn' and morphemes.number='sing' "
    "order by rnd_id"
)

# An explicit cursor works regardless of how the installed MySQLdb version implements
# the connection's context manager. The connection itself is left open, as before.
db_curr = db_conn.cursor()
try:
    db_curr.execute(words_query, (WORD_LENGTH,))
    # Rows are consumed in chunks of PHONOSEM_VALUES_LEN.
    # NOTE(review): this assumes every word contributes exactly PHONOSEM_VALUES_LEN rows;
    # a word with a different row count would shift all following chunks - confirm.
    rows = db_curr.fetchmany(PHONOSEM_VALUES_LEN)
    while rows:
        phonosem_values = np.zeros(PHONOSEM_VALUES_LEN)
        stresses = np.zeros(WORD_MAX_LENGTH)
        for (rnd_id, sound_word, primary_stress, type_id, phonosem_value) in rows:
            # type_id is 1-based; values are divided by 6 (presumably the scale maximum - confirm).
            phonosem_values[type_id - 1] = phonosem_value / 6
        # sound_word / primary_stress come from the last row of the chunk; they are
        # expected to be the same for every row of one word.
        stresses[primary_stress] = 1
        x_sound_words.append(sound_word)
        x_stresses.append(stresses)
        x_phonosem_values.append(phonosem_values)
        rows = db_curr.fetchmany(PHONOSEM_VALUES_LEN)
finally:
    db_curr.close()

  


In [4]:
def char_to_1_hot_tensor(char):
    """Return a length-VOCAB_LEN vector of zeros with a 1 at the WORD_DIC index of `char`.

    Raises KeyError when `char` is not a key of WORD_DIC.
    """
    hot_position = WORD_DIC[char]
    one_hot = np.zeros((VOCAB_LEN))
    one_hot[hot_position] = 1.
    return one_hot

def word_to_1_hot_tensors(word):
    """Encode `word` as a (WORD_MAX_LENGTH, VOCAB_LEN) matrix, one one-hot row per character.

    Rows past the end of the word stay all-zero.
    """
    encoded = np.zeros((WORD_MAX_LENGTH, VOCAB_LEN))
    for position, letter in enumerate(word):
        encoded[position, :] = char_to_1_hot_tensor(letter)
    return encoded

def words_to_1_hot_tensors(words):
    """Encode every word with word_to_1_hot_tensors and stack the results into one array."""
    return np.array([word_to_1_hot_tensors(single_word) for single_word in words])

##############################################################

def sound_char_to_1_hot_tensor(sound_char):
    """Return a length-SOUND_VOCAB_LEN vector of zeros with a 1 at the SOUND_WORD_DIC index of `sound_char`.

    Raises KeyError when `sound_char` is not a key of SOUND_WORD_DIC.
    """
    hot_position = SOUND_WORD_DIC[sound_char]
    one_hot = np.zeros((SOUND_VOCAB_LEN))
    one_hot[hot_position] = 1.
    return one_hot

def sound_word_to_1_hot_tensors(sound_word, max_length=SOUND_WORD_MAX_LENGTH):
    """Encode `sound_word` as a (max_length, SOUND_VOCAB_LEN) matrix of one-hot rows.

    The string is split into sounds (one character plus an optional apostrophe); row k
    holds the one-hot vector of the k-th sound and unused rows stay all-zero.
    A word with more than `max_length` sounds raises IndexError on the extra row.
    """
    encoded = np.zeros((max_length, SOUND_VOCAB_LEN))
    sounds = re.findall('[\sа-яё]\'?', sound_word)
    for position, sound in enumerate(sounds):
        encoded[position, :] = sound_char_to_1_hot_tensor(sound)
    return encoded

def sound_words_to_1_hot_tensors(sound_words, max_length, p=None):
    """One-hot encode a collection of sound words.

    Args:
        sound_words: iterable of sound-word strings.
        max_length: number of rows in each per-word matrix.
        p: optional row selector. When None the full per-word matrices are returned;
            otherwise only `matrix[p]` of every word is returned.

    Returns:
        np.ndarray stacking either the full matrices or the selected rows.
    """
    encoded = np.array([sound_word_to_1_hot_tensors(sound_word, max_length) for sound_word in sound_words])
    # Identity test: `p == None` would be evaluated element-wise for an array-valued p.
    if p is None:
        return encoded
    return np.array([word_matrix[p] for word_matrix in encoded])


def first_sound_char_tensor_to_sound_char(sound_char_tensor):
    """Return the sound whose SOUND_WORD_DIC index equals the argmax of `sound_char_tensor`.

    Returns None when no sound has that index.
    """
    winner = np.argmax(sound_char_tensor)
    return next((sound for sound, position in SOUND_WORD_DIC.items() if position == winner), None)
        
def sound_words_tensor_to_sound_chars(sound_char_tensor):
    """Pair every sound with its entry of `sound_char_tensor`, sorted by that entry ascending.

    Returns a list of (sound, value) tuples.
    """
    scored = {sound: sound_char_tensor[position] for sound, position in SOUND_WORD_DIC.items()}
    return sorted(scored.items(), key=lambda pair: pair[1])

##############################################################

def get_first_value(values):
    """Return column 0 of `values` (converted to a NumPy array), i.e. the first value of every row."""
    as_matrix = np.array(values)
    first_column = as_matrix[:, 0]
    return first_column

def sound_word_to_stress_char_idx(sound_word, stress):
    """Return the SOUND_WORD_DIC index of the character at the first position where `stress` equals 1.

    NOTE(review): the position is applied to the raw string, not to the list of sounds
    produced by the tokenising regex, so an apostrophe-marked sound is looked up without
    its marker - confirm that `stress` positions are meant to be raw string offsets.
    """
    first_marked = np.where(stress==1)[0][0]
    return SOUND_WORD_DIC[sound_word[first_marked]]
    
def sound_word_to_first_char_idx(sound_word):
    """Return the SOUND_WORD_DIC index of the first sound found in `sound_word`.

    Raises IndexError when the string contains no sound at all.
    """
    leading_sound = re.findall('[\sа-яё]\'?', sound_word)[0]
    return SOUND_WORD_DIC[leading_sound]
    
def sound_word_to_first_char_tensor(sound_word):
    """Return a length-SOUND_VOCAB_LEN one-hot vector marking the first sound of `sound_word`."""
    one_hot = np.zeros(SOUND_VOCAB_LEN)
    hot_position = sound_word_to_first_char_idx(sound_word)
    one_hot[hot_position] = 1
    return one_hot

def sound_word_to_stress_char_tensor(sound_word, stress):
    """Return a length-SOUND_VOCAB_LEN one-hot vector at the index given by sound_word_to_stress_char_idx."""
    one_hot = np.zeros(SOUND_VOCAB_LEN)
    hot_position = sound_word_to_stress_char_idx(sound_word, stress)
    one_hot[hot_position] = 1
    return one_hot

def get_sound_words_first_chars_tensors(sound_words):
    """Stack the first-sound one-hot vector of every word in `sound_words` into one array."""
    return np.array([sound_word_to_first_char_tensor(single_word) for single_word in sound_words])

def get_sound_words_stress_chars_tensors(sound_words, stresses):
    """Stack the stressed-character one-hot vector of every (word, stress) pair into one array.

    `stresses` is indexed in parallel with `sound_words`, so it needs at least as many entries.
    """
    return np.array([
        sound_word_to_stress_char_tensor(sound_words[row], stresses[row])
        for row in range(len(sound_words))
    ])

def get_sound_word_to_stress_char_tensor(sound_word):
    """Return a length-SOUND_VOCAB_LEN vector with a 1 for every sound of `sound_word` listed in STRESS_SOUDNS."""
    marks = np.zeros(SOUND_VOCAB_LEN)
    stressed_sounds = [sound for sound in re.findall('[\sа-яё]\'?', sound_word) if sound in STRESS_SOUDNS]
    for sound in stressed_sounds:
        marks[SOUND_WORD_DIC[sound]] = 1
    return marks

def get_sound_words_to_stress_char_tensors(sound_words):
    """Stack the stressed-sound indicator vector of every word in `sound_words` into one array."""
    return np.array([get_sound_word_to_stress_char_tensor(single_word) for single_word in sound_words])

##############################################################

def sound_word_delete_first_and_stress(sound_word):
    """Return the sounds of `sound_word` without the first sound and without stressed sounds.

    The string is split into sounds (one character plus an optional apostrophe); the
    first sound is dropped and every remaining sound listed in STRESS_SOUDNS is removed.

    Returns:
        list of sound strings, in their original order.
    """
    sound_chars = re.findall(r"[\sа-яё]'?", sound_word)
    # Iterate over the sounds themselves. The previous loop iterated over
    # range(1, len(sound_chars)) and therefore collected integer positions,
    # which never matched any stressed sound or vocabulary entry.
    return [sound_char for sound_char in sound_chars[1:] if sound_char not in STRESS_SOUDNS]

def sound_word_to_vocab_freq_tensor(sound_word):
    """Count how often each vocabulary sound occurs in the output of sound_word_delete_first_and_stress.

    Returns a length-SOUND_VOCAB_LEN vector indexed by SOUND_WORD_DIC.
    """
    remaining = sound_word_delete_first_and_stress(sound_word)
    counts = np.zeros(SOUND_VOCAB_LEN)
    for sound, position in SOUND_WORD_DIC.items():
        counts[position] = remaining.count(sound)
    return counts

def sound_words_to_vocab_freq_tensors(sound_words):
    """Stack the vocabulary count vector of every word in `sound_words` into one array."""
    return np.array([sound_word_to_vocab_freq_tensor(single_word) for single_word in sound_words])

##############################################################

def sound_word_to_vocab_part_freq_tensor(sound_word):
    """Return, for every vocabulary sound, its share among the sounds of `sound_word`.

    Args:
        sound_word: a sound-word string (split into sounds here) or an already
            split sequence of sounds.

    Returns:
        Length-SOUND_VOCAB_LEN vector indexed by SOUND_WORD_DIC; all zeros when the
        word contains no sounds.
    """
    vocab = np.zeros(SOUND_VOCAB_LEN)
    # Count whole sounds. Substring counting on the raw string counted a plain sound
    # inside its apostrophe-marked variant as well, and the apostrophes inflated the
    # denominator.
    if isinstance(sound_word, str):
        sound_chars = re.findall(r"[\sа-яё]'?", sound_word)
    else:
        sound_chars = list(sound_word)
    if not sound_chars:
        # Nothing to count: avoid dividing by zero.
        return vocab
    total = len(sound_chars)
    for c, i in SOUND_WORD_DIC.items():
        vocab[i] = sound_chars.count(c) / total
    return vocab

def sound_words_to_vocab_part_freq_tensors(sound_words):
    """Stack the vocabulary share vector of every word in `sound_words` into one array."""
    return np.array([sound_word_to_vocab_part_freq_tensor(single_word) for single_word in sound_words])


def sound_char_tensor_to_sound_char(sound_char_tensor):
    """Decode a score vector into the sound whose SOUND_WORD_DIC index is its argmax.

    Returns None when no sound has that index.
    """
    best_position = np.argmax(sound_char_tensor)
    matches = (sound for sound, position in SOUND_WORD_DIC.items() if position == best_position)
    return next(matches, None)

In [5]:
# Model input: one row of phonosemantic values per word.
x_p = np.array(x_phonosem_values)

# Encode every sound word once, then slice out one target array per sound position.
# (Previously the whole dataset was re-encoded for each of the nine positions.)
# The reshape keeps the slicing valid even when no words were loaded.
y_all_positions = sound_words_to_1_hot_tensors(x_sound_words, WORD_LENGTH).reshape(-1, WORD_LENGTH, SOUND_VOCAB_LEN)
y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8 = (
    y_all_positions[:, position] for position in range(WORD_LENGTH)
)


print('phomosem shape:', x_p.shape)

phomosem shape: (8002, 25)


## Модель

In [6]:
# Build a fully connected network mapping the phonosemantic values of a word to one
# output vector per sound position.
#
# The input tensor is named `inputs` so the builtin `input()` is not shadowed for the
# rest of the kernel session.
#
# NOTE(review): no `activation` is passed to any Dense layer and the loss is mean
# absolute error on one-hot targets. If Dense defaults to a linear activation, the whole
# stack reduces to a single affine map - confirm this is intended before relying on depth.
inputs = Input((PHONOSEM_VALUES_LEN,))
x = Dense(2560)(inputs)
for layer_idx in range(10):
    x = Dense(2560)(x)

p = 2      # hidden layers added in front of each output head
hp = 1024  # width of those hidden layers
output = []

# `x` is reassigned inside the loop, so the head for position w is stacked on top of the
# hidden layers of all previous positions rather than branching from a shared trunk.
for w in range(WORD_LENGTH):
    for head_layer_idx in range(p):
        x = Dense(hp)(x)
    output_x = Dense(SOUND_VOCAB_LEN)(x)
    output.append(output_x)

model = Model(inputs=inputs, outputs=output)

sgd = optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
# Alternative optimizer; defined but not passed to compile() below.
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0.0)
model.compile(loss='mean_absolute_error', optimizer=sgd, metrics=['accuracy'])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 25)            0                                            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 2560)          66560       input_1[0][0]                    
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 2560)          6556160     dense_1[0][0]                    
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 2560)          6556160     dense_2[0][0]                    
___________________________________________________________________________________________

## Тренировка модели

In [7]:
# Hold out 10% of the words for validation; the fixed random_state keeps the split stable.
split_arrays = train_test_split(
    x_p, y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8,
    test_size=0.10, random_state=1,
)
(x_train, x_val,
 y_0_train, y_0_val, y_1_train, y_1_val, y_2_train, y_2_val,
 y_3_train, y_3_val, y_4_train, y_4_val, y_5_train, y_5_val,
 y_6_train, y_6_val, y_7_train, y_7_val, y_8_train, y_8_val) = split_arrays

# One target array per sound position, in model output order.
train_targets = [y_0_train, y_1_train, y_2_train, y_3_train, y_4_train, y_5_train, y_6_train, y_7_train, y_8_train]
val_targets = [y_0_val, y_1_val, y_2_val, y_3_val, y_4_val, y_5_val, y_6_val, y_7_val, y_8_val]

model.fit(x_train, train_targets, verbose=0, epochs=500, validation_data=(x_val, val_targets), batch_size=10)

W0318 21:14:59.792827 140507994244928 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:601: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


<keras.callbacks.History at 0x7fc98b1d4f28>

In [8]:
# Spot check: feed the phonosemantic values of one dataset row (index 55) to the model.
val = [x_p[55]]

# `p` is indexed per sound position below (it also overwrites the `p` set in the model cell).
p = model.predict(np.array(val))

print('value:', val)

# Decode each position's output to the sound with the highest score.
for i in range(WORD_LENGTH):
    print('>>',sound_char_tensor_to_sound_char(p[i]))

value: [array([0.43937333, 0.39482667, 0.53562333, 0.59477167, 0.53691667,
       0.44590833, 0.39348833, 0.402895  , 0.48006   , 0.49689   ,
       0.45056667, 0.51067167, 0.49299833, 0.47493833, 0.51727167,
       0.40991833, 0.47905667, 0.45886167, 0.43612667, 0.46582833,
       0.5405    , 0.41077833, 0.46115833, 0.36687333, 0.48788167])]
>> в
>> а
>> ч
>> т
>> ё'
>> х
>> н
>> а
>> я


In [9]:
# Persist the trained model; the test section below reloads it from this path.
model.save('./current_adjf_9_scaled.h5')

## Тест

In [10]:
# Reload the model saved above and rebind `model` to it.
from keras.models import load_model
model = load_model('./current_adjf_9_scaled.h5')

Using TensorFlow backend.
W0318 23:11:54.250297 140677512243008 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:601: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [11]:
import numpy as np

def print_sound_word(sound_chars_tensor):
    """Print the decoded sound of every entry of `sound_chars_tensor` on one line, then a newline."""
    for position in range(len(sound_chars_tensor)):
        decoded = sound_char_tensor_to_sound_char(sound_chars_tensor[position])
        print(decoded, end='')
    print()
    
def generate_phonetics(seed=None):
    """Return PHONOSEM_VALUES_LEN random values drawn uniformly from [0, 1).

    Args:
        seed: optional integer. When given, the values come from a private generator
            seeded with it, so the same seed always yields the same vector. When None
            (the default) the global NumPy random state is used, as before.
    """
    if seed is None:
        return np.random.rand(PHONOSEM_VALUES_LEN)
    return np.random.RandomState(seed).rand(PHONOSEM_VALUES_LEN)

### Плохое прилагательное

In [12]:
# Random phonosemantic vector with the first value forced to 0.99.
# NOTE(review): per the section title this is meant to produce a "bad" adjective;
# which scale index 0 stands for is not visible here - confirm.
phonetics = generate_phonetics()
phonetics[0] = 0.99
val = [ phonetics ]
print('value:', val)
# Predict one output per sound position and print the decoded sound word.
sound_word = model.predict(np.array(val))
print_sound_word(sound_word)

value: [array([0.99      , 0.36503459, 0.74253885, 0.8532546 , 0.79126841,
       0.63011117, 0.62784541, 0.7330594 , 0.36323942, 0.13561595,
       0.56336333, 0.72990685, 0.1318455 , 0.45312354, 0.09374759,
       0.53301645, 0.51584652, 0.76485056, 0.71417195, 0.31599311,
       0.34913228, 0.96644598, 0.80190428, 0.8368579 , 0.42206441])]
к'од'су'к'т'ый


### Доброе прилагательное

In [13]:
# Random phonosemantic vector with the value at index 22 forced to 0.01.
# NOTE(review): per the section title this is meant to produce a "kind" adjective;
# which scale index 22 stands for is not visible here - confirm.
phonetics = generate_phonetics()
phonetics[22] = 0.01
val = [ phonetics ]
print('value:', val)
# Predict one output per sound position and print the decoded sound word.
sound_word = model.predict(np.array(val))
print_sound_word(sound_word)

value: [array([0.55289766, 0.28153077, 0.86213694, 0.75548432, 0.75718696,
       0.5001844 , 0.91437327, 0.39326511, 0.6808949 , 0.82507183,
       0.82368871, 0.37707912, 0.59247035, 0.7176963 , 0.3525429 ,
       0.82208437, 0.53211653, 0.60457011, 0.64346872, 0.36681275,
       0.39868039, 0.82482807, 0.01      , 0.66749165, 0.40922422])]
згю'в'у'п'ный
