In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
items_sport = []
with open('/content/drive/My Drive/slots/sport.csv') as file:
  items_sport = [slot.replace('\n', '') for slot in file]

In [0]:
import itertools
from collections import namedtuple

"""Data sampler for slotfilling as Cartesian product of slots.
"""


__all__ = [
    'PhraseSlot',
    'make_samples'
]


class PhraseSlot(namedtuple('PhraseSlot', ['phrase', 'slot'])):
    """
    This class represents named tuple with two records: phrase and slot.
    This wrapper over namedtuple is needed only for initialization with default values.
    """
    
    __slots__ = ()
    def __new__(self, phrase, slot=None):
        assert type(phrase) is str, f'{phrase} must be of type `str`, got {type(phrase)}'
        phrase = phrase.strip()
        assert '' != phrase, f'phrase must be non empty'
        if slot is not None:
            assert type(slot) is str, f'{slot} must be of type `str`, got {type(slot)}'
            slot = slot.strip()
            assert '' != slot, f'slot must be non empty'
        
        return super(PhraseSlot, self).__new__(self, phrase, slot)


def make_slot(phrase_slots, sep=' '):
    """
    Makes pair of phrase and list of slots, where phrase is string and
    slot is dict with the following structure:
    
        {'start': start position of slot,
         'end': end position of slot,
         'title': name of slot,
         'text': slot value}
    
    inputs:
        phrase_slots: list of PhraseSlot
        sep: separator between phrases when concatenating
    outputs: pair of text and slots
    """
    
    current_length = 0
    slots = list()
    for phrase, slot in phrase_slots:
        start_position = current_length
        current_length += len(phrase) + len(sep)
        if slot is not None:
            slot_dict = dict()
            slot_dict['start'] = start_position
            slot_dict['end'] = start_position + len(phrase)
            slot_dict['title'] = slot
            slot_dict['text'] = phrase
            slots.append(slot_dict)
            
    return sep.join([phrase for phrase, _ in phrase_slots]), slots


def make_samples(*phrase_slots):
    """
    Makes list of samples for slotfilling from lists *phrase_slots as descartes product

    Example usage:
        begin_phrases = [PhraseSlot(phrase='я хочу купить'),
                    PhraseSlot(phrase='где купить')]
        slots = [PhraseSlot(phrase='велик', slot='Item'),
                PhraseSlot(phrase='велосипед', slot='Item')]
        prices = [PhraseSlot(phrase='60к', slot='Price')]

        samples = make_samples(begin_phrases, slots, prices)
    """
    
    return [make_slot(phrase_slot) for phrase_slot in itertools.product(*phrase_slots)]

In [0]:
import numpy as np
!pip install pymorphy2
import pymorphy2 as pmh

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K    22% |███████                         | 10kB 16.2MB/s eta 0:00:01[K    44% |██████████████▏                 | 20kB 5.0MB/s eta 0:00:01[K    66% |█████████████████████▎          | 30kB 7.1MB/s eta 0:00:01[K    88% |████████████████████████████▍   | 40kB 4.5MB/s eta 0:00:01[K    100% |████████████████████████████████| 51kB 4.7MB/s 
Collecting dawg-python>=0.7 (from pymorphy2)
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Collecting pymorphy2-dicts<3.0,>=2.4 (from pymorphy2)
[?25l  Downloading https://files.pythonhosted.org/packages/02/51/2465fd4f72328ab50877b54777764d928da8cb15b74e2680fc1bd8cb3173/pymorphy2_dicts-2.4.393442.3710985-py2.py3-none-any.whl (7.1MB)
[K    100% |

In [0]:
def one_hot(n, ind):
  arr = np.zeros(n)
  arr[ind] = 1.0
  return arr
classes_names = [
    "Item begin",
    "Item inside",
    "Price from begin",
    "Price from inside",
    "Price to begin",
    "Price to inside",
    "Price begin",
    "Price inside",
    "Cashback begin",
    "Cashback inside",
    "Attribute begin",
    "Attribute inside",
    "<Stop>"
]
num_classes = len(classes_names)
slots_dict = {classes_names[i] : one_hot(num_classes, i) for i in range(num_classes)}
slots_dict_rev = {v.argmax():k for k, v in slots_dict.items()}

In [0]:
from gensim.models import word2vec
from functools import reduce

In [0]:
import keras
from keras.layers import Bidirectional, LSTM, TimeDistributed, Dense, Conv1D, Conv2D
from keras.layers import Activation, Input, Embedding, Dropout
from keras.models import Sequential, Model

Using TensorFlow backend.


In [0]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-fzj7okuu
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-ivgptdan/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib
Installing collected packages: keras-contrib
Successfully installed keras-contrib-2.0.8


In [0]:
from keras_contrib.layers import CRF

In [0]:
#begin_phrases = [PhraseSlot(phrase='я хочу купить', slot='Begin'),
#                    PhraseSlot(phrase='где купить', slot='Begin')]
slots = [
    PhraseSlot(phrase='велик', slot='Item'),
    PhraseSlot(phrase='велосипед', slot='Item'),
    PhraseSlot(phrase='лыжи', slot='Item'),
    PhraseSlot(phrase='коньки', slot='Item'),
    PhraseSlot(phrase='сноуборд', slot='Item'),
    PhraseSlot(phrase='борд', slot='Item'),
    PhraseSlot(phrase='байк', slot='Item'),
    PhraseSlot(phrase='лыжные штаны', slot='Item')
    
]
slots += [
    PhraseSlot(phrase=text, slot='Item') for text in items_sport[:50]
]
prices = [
    PhraseSlot(phrase='до ###', slot='Price to'),
    PhraseSlot(phrase='за ###', slot='Price'),
    PhraseSlot(phrase='от ###', slot='Price from'),
    PhraseSlot(phrase='### к', slot='Price'),
    PhraseSlot(phrase='до ### к', slot='Price to'),
    PhraseSlot(phrase='за ### к', slot='Price'),
    PhraseSlot(phrase='от ### к', slot='Price from')
]
cashback = [
    PhraseSlot(phrase='кэшбек', slot='Cashback'),
    PhraseSlot(phrase='кэшбек ### %', slot='Cashback'),
    PhraseSlot(phrase='### % кэшбек', slot='Cashback'),
    PhraseSlot(phrase='### %', slot='Cashback'),
    PhraseSlot(phrase='### процент', slot='Cashback'),
    PhraseSlot(phrase='### процент кэшбек', slot='Cashback'),
    PhraseSlot(phrase='кэшбек ###', slot='Cashback'),
    PhraseSlot(phrase='кэшбек ### процент', slot='Cashback'),
    PhraseSlot(phrase='кэшбек ###', slot='Cashback')
]

samples = make_samples(slots, prices, cashback)

In [0]:
all_texts = [slot[0].split(' ') for slot in samples]

In [0]:
all_texts

[['велик', 'до', '###', 'кэшбек'],
 ['велик', 'до', '###', 'кэшбек', '###', '%'],
 ['велик', 'до', '###', '###', '%', 'кэшбек'],
 ['велик', 'до', '###', '###', '%'],
 ['велик', 'до', '###', '###', 'процент'],
 ['велик', 'до', '###', '###', 'процент', 'кэшбек'],
 ['велик', 'до', '###', 'кэшбек', '###'],
 ['велик', 'до', '###', 'кэшбек', '###', 'процент'],
 ['велик', 'до', '###', 'кэшбек', '###'],
 ['велик', 'за', '###', 'кэшбек'],
 ['велик', 'за', '###', 'кэшбек', '###', '%'],
 ['велик', 'за', '###', '###', '%', 'кэшбек'],
 ['велик', 'за', '###', '###', '%'],
 ['велик', 'за', '###', '###', 'процент'],
 ['велик', 'за', '###', '###', 'процент', 'кэшбек'],
 ['велик', 'за', '###', 'кэшбек', '###'],
 ['велик', 'за', '###', 'кэшбек', '###', 'процент'],
 ['велик', 'за', '###', 'кэшбек', '###'],
 ['велик', 'от', '###', 'кэшбек'],
 ['велик', 'от', '###', 'кэшбек', '###', '%'],
 ['велик', 'от', '###', '###', '%', 'кэшбек'],
 ['велик', 'от', '###', '###', '%'],
 ['велик', 'от', '###', '###', 'проц

In [0]:
uniq_words = []
for sentence in all_texts:
  for word in sentence:
    uniq_words.append(word)

uniq_words = list(set(uniq_words))
words_dict = {uniq_words[i] : i for i in range(len(uniq_words))}

In [0]:
w2v_size = 100

In [0]:
w2v = word2vec.Word2Vec(all_texts, size=w2v_size, window=3, workers=4, min_count=1)

In [0]:
w2v_dict = dict(zip(w2v.wv.index2word, w2v.wv.vectors))

In [0]:
w2v_dict.keys()

dict_keys(['###', 'кэшбек', 'к', '%', 'процент', 'до', 'за', 'от', 'велосипед', 'и', 'восстановители', 'одежда', 'велик', 'лыжи', 'коньки', 'сноуборд', 'борд', 'байк', 'лыжные', 'штаны', 'аксессуары', 'носки', 'посуда', 'посттренировочные', 'комплексы', 'anon', 'хондроитин', 'женская', 'roxy', 'подарочные', 'сертификаты', 'термобелье', 'активное', 'долголетие', 'веломайки', 'аминокислоты', 'каски', 'спортивные', 'блоки-ролики', 'полиспасты', 'losraketos', 'точки', 'страховки', 'thule', 'детские', 'amplifi', 'spy+', 'ночные', 'привязи', 'еда', 'шоссейный', 'треккинговые', 'палки', 'самокат', 'диетическое', 'питание', 'страховочные', 'системы', 'чехлы', '686', 'bonus', 'gloves', 'жилеты', 'сандалии', 'куртки', 'циклокросс', 'oakley', 'детский', 'свитера', 'сноуборды', 'балаклавы', 'бафы', 'фонари', 'vonzipper', 'электровелосипед', 'разгрузки', 'креатин', 'мужская', 'бахилы', 'веревки', 'городская', 'обувь'])

***Готовим входные векторы:***

In [0]:
train_samples = samples[:int(0.7*len(samples))]
test_samples = samples[int(0.7*len(samples)+1):]

In [0]:
analyzer = pmh.MorphAnalyzer()

In [0]:
uniq_grams = []
for sample in samples:
  for word in sample[0].split(' '):
    uniq_grams.append(analyzer.parse(word)[0].tag.POS)

In [0]:
uniq_grams = list(set(uniq_grams))

In [0]:
uniq_grams

['PREP', None, 'NOUN', 'ADJF', 'CONJ']

In [0]:
grams_dict = {uniq_grams[i] : one_hot(len(uniq_grams) + 1, i) for i in range(len(uniq_grams))}

In [0]:
grams_dict['<Stop>'] = one_hot(len(uniq_grams) + 1, len(uniq_grams))
grams_dict

{'<Stop>': array([0., 0., 0., 0., 0., 1.]),
 'ADJF': array([0., 0., 0., 1., 0., 0.]),
 'CONJ': array([0., 0., 0., 0., 1., 0.]),
 'NOUN': array([0., 0., 1., 0., 0., 0.]),
 None: array([0., 1., 0., 0., 0., 0.]),
 'PREP': array([1., 0., 0., 0., 0., 0.])}

In [0]:
words_dict['stop'] = len(words_dict)

In [0]:
X_words = []
X_morph = []
labels = []
X_emb = []
for sample in train_samples:
  sentence_x = []
  embedding_x = []
  morph_x = []
  sentence_y = []
  for slot in sample[1]:
    words = slot['text'].split(' ')
    title = slot['title']
    count = 0
    for word in words:
      sentence_x.append(w2v_dict[word])
      embedding_x.append(words_dict[word])
      morph_x.append(grams_dict[analyzer.parse(word)[0].tag.POS])
      if(count == 0):
        sentence_y.append(np.array(slots_dict[title + " begin"]))
      else:
        sentence_y.append(np.array(slots_dict[title + " inside"]))
      count += 1
  while len(embedding_x) < 14:
    embedding_x.append(np.array([words_dict['stop']]))
    sentence_y.append(slots_dict['<Stop>'])
    morph_x.append(grams_dict['<Stop>'])
  sentence_x, sentence_y, embedding_x = np.array(sentence_x), np.array(sentence_y), np.array(embedding_x)
  X_words.append(sentence_x)
  X_emb.append(embedding_x)
  X_morph.append(morph_x)
  labels.append(sentence_y)

In [0]:
lengths = [sentence.shape[0] for sentence in X_emb]
max_length = max(lengths)
min_length = min(lengths)

In [0]:
max_length

14

In [0]:
X_words = np.array(X_words)
X_emb = np.array([X_emb])
X_morph = np.array(X_morph)
labels = np.array(labels)

***Model***

In [0]:
from keras.layers import Reshape

In [0]:
#input_word_layer = Input(shape=(None, w2v_size))
input_word_layer = Input(shape=(max_length,))
embed_word_layer = Embedding(1, 16, input_length=max_length)(input_word_layer)
input_morph_layer = Input(shape=(max_length, len(grams_dict)))
word_lstm = Bidirectional(LSTM(64, return_sequences=True))(embed_word_layer)
morph_lstm = Bidirectional(LSTM(16, return_sequences=True))(input_morph_layer)
concat = keras.layers.concatenate([word_lstm, morph_lstm])
lstm = Bidirectional(LSTM(32, return_sequences=True))(concat)
conv = Conv1D(filters=1, kernel_size=(3), padding='same')(lstm)
dense1 = Dense(16)(conv)
act1 = Activation('relu')(dense1)
crf = CRF(num_classes)
out = crf(act1)

model = Model([input_word_layer, input_morph_layer], [out])

In [0]:
model.compile(
    loss=crf.loss_function,
    optimizer='adam',
    metrics=[crf.accuracy]
)



In [0]:
X_emb[0][1].shape

(14,)

In [0]:
#model.fit([np.array([X_words[0]]), np.array([X_morph[0]])], np.array([Y[0]]), verbose=2, epochs=1000, shuffle=True)
model.fit([X_emb[0], X_morph], [labels], verbose=1, epochs=100, batch_size=None)

Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f75ebe55be0>

Evaluation:

In [0]:
#model.evaluate([np.array([X_words[0]]), np.array([X_morph[0]])], np.array([Y[0]]))

In [0]:
len(test_samples)

1096

In [0]:
test_sentence = []
test_morph = []
true_y = []
emb_x = []
sample = test_samples[516]
text = sample[0]
#print(samples[12054])
for slot in sample[1]:
    words = slot['text'].split(' ')
    title = slot['title']
    count = 0
    for word in words:
      test_sentence.append(w2v_dict[word])
      emb_x.append(np.array(words_dict[word]))
      test_morph.append(grams_dict[analyzer.parse(word)[0].tag.POS])
      if(count == 0):
        true_y.append(slots_dict[title + " begin"])
      else:
        true_y.append(slots_dict[title + " inside"])
      count += 1
while len(emb_x) < max_length:
  emb_x.append(np.array([words_dict['stop']]))
  true_y.append(slots_dict['<Stop>'])
  test_morph.append(grams_dict['<Stop>'])

In [0]:
print(emb_x)

[array(9), array(52), array(40), array(53), array(42), array(6), array(42), array(17), array(2), array([82]), array([82]), array([82]), array([82]), array([82])]


In [0]:
print(text)
print(true_y)

балаклавы и бафы за ### к ### процент кэшбек
[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])]


In [0]:
#prediction = model.predict([np.array([test_sentence]), np.array([test_morph])])[0]
prediction = model.predict([np.array([emb_x]), np.array([test_morph])])[0]

In [0]:
list(zip([slots_dict_rev[vec.argmax()] for vec in prediction], [slots_dict_rev[vec.argmax()] for vec in true_y]))

[('Item begin', 'Item begin'),
 ('Price begin', 'Item inside'),
 ('Price inside', 'Item inside'),
 ('Cashback begin', 'Price begin'),
 ('Cashback inside', 'Price inside'),
 ('<Stop>', 'Price inside'),
 ('<Stop>', 'Cashback begin'),
 ('<Stop>', 'Cashback inside'),
 ('<Stop>', 'Cashback inside'),
 ('<Stop>', '<Stop>'),
 ('<Stop>', '<Stop>'),
 ('<Stop>', '<Stop>'),
 ('<Stop>', '<Stop>'),
 ('<Stop>', '<Stop>')]