# Генерирование имен с помощью нейронных сетей

## Модели биграм в NLTK

In [None]:
# Получим данные
!wget -dinos.txt "https://www.dropbox.com/s/e0v7ex10s5kfu0y/dinos.txt?dl=1"

In [1]:
import nltk

from sklearn.utils import shuffle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, TimeDistributed, Bidirectional
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
names = [name.strip().lower() for name in open('dinos.txt').readlines()]
print(names[:10])

['aachenosaurus', 'aardonyx', 'abdallahsaurus', 'abelisaurus', 'abrictosaurus', 'abrosaurus', 'abydosaurus', 'acanthopholis', 'achelousaurus', 'acheroraptor']


In [3]:
chars = [char  for name in names for char in name]
freq = nltk.FreqDist(chars)

print(list(freq.keys()))

['r', 'm', 't', 'o', 'g', 's', 'v', 'p', 'j', 'i', 'a', 'l', 'd', 'f', 'q', 'x', 'k', 'b', 'z', 'y', 'h', 'w', 'e', 'n', 'u', 'c']


In [4]:
cfreq = nltk.ConditionalFreqDist(nltk.bigrams(chars))
cfreq['a']

FreqDist({'a': 26,
          'b': 32,
          'c': 109,
          'd': 41,
          'e': 48,
          'f': 7,
          'g': 44,
          'h': 21,
          'i': 26,
          'j': 8,
          'k': 22,
          'l': 146,
          'm': 74,
          'n': 354,
          'o': 27,
          'p': 96,
          'q': 3,
          'r': 131,
          's': 187,
          't': 213,
          'u': 792,
          'v': 34,
          'w': 10,
          'x': 12,
          'y': 14,
          'z': 10})

In [5]:
cprob = nltk.ConditionalProbDist(cfreq, nltk.MLEProbDist)
print('p(a a) = %1.4f' %cprob['a'].prob('a'))
print('p(a b) = %1.4f' %cprob['a'].prob('b'))
print('p(a u) = %1.4f' %cprob['a'].prob('u'))

p(a a) = 0.0105
p(a b) = 0.0129
p(a u) = 0.3185


In [6]:
from math import log
log(cprob['a'].prob('a')) + log(cprob['a'].prob('b')) + log(cprob['a'].prob('c'))

-12.041317008359863

In [7]:
l = sum([freq[char] for char in freq])
def unigram_prob(char):
    return freq[char] / l
print('p(a) = %1.4f' %unigram_prob('a'))

p(a) = 0.1354


In [8]:
[bi for bi in nltk.bigrams('aachenosaurus')]

[('a', 'a'),
 ('a', 'c'),
 ('c', 'h'),
 ('h', 'e'),
 ('e', 'n'),
 ('n', 'o'),
 ('o', 's'),
 ('s', 'a'),
 ('a', 'u'),
 ('u', 'r'),
 ('r', 'u'),
 ('u', 's')]

#### Задание 1

1. Напишите функцию для оценки вероятности имени динозавра. 
2. Найдите наиболее вероятное имя динозавра из данного списка. 

In [9]:
cprob["a"].generate()

'u'

In [10]:
# решение 1.1
def name_prob(name):
    p = unigram_prob(name[0])
    for i in range(len(name)-1):
        p *=  cprob[name[i]].prob(name[i+1])
    return p

name_prob(names[0])

2.0222358416238476e-10

#### Задание 2

Напишите функцию для генерации нового имени динозавра фиксированной длины.

In [11]:
# решение 2
def generate_name(cprob, first_char, num_chars):
    name = ''
    name += first_char
    for i in range(num_chars):
        char = cprob[first_char].generate()
        name += char
        first_char = char
    return name

generate_name(cprob, 't', 9)

'tonrurodha'

## Нейронные языковые модели

* Вход: $n$-грамы $w_{1:k}$
* $v(w_i)$ – эмбеддинг слова $w_i$, $v(w_i) \in \mathbb{R}^{d_{emb}}$, $d_{emb}$ – размерность эмбеддинга, $v(w) = E_{[w]}$
* $x = [v(w_1), v(w_2), \ldots , v(w_k)]$

$\widehat{y} = P(w_i | w_{1:k} ) = \texttt{LM}(w_{1:k}) = \texttt{softmax}(hW^2 +b^2)$

$h = g(xW^1+b^1)$

$w_i \in V$, $E \in \mathbb{R}^{|V|\times d_{emb}}, W^1 \in \mathbb{R}^{k \cdot d_{emb} \times d_{hid}}, b^1 \in \mathbb{R} ^ {d_{hid}}, W^2 \in \mathbb{R}^{d_{hid} \times |V|}, b^2 \in \mathbb{R} ^ {|V|}$

### Семплирование в нейронных языковых моделях 
### (Генерация текстов с помощью нейронных языковых моделей)

1. Задать начальную последовательность символов длины $k$ (/слов)
2. Предсказать распределение вероятностей слов с условием на $k$ предыдущих слов
3. 1. Выбрать слово с наибольшей вероятностью
3. 2. Выбрать слово по предсказаному распределению
4. Сдвинуть окно на одно слово и повторить 

#### Линейный поиск  (beam search)
Всегда помним $h$ наиболее вероятных гипотез:
1. Для генерации первого слова в последоватительности генерируем $h$ кандидатов, а не 1
2. Генерируем $h \times h$ кандидатов для второго слова и храним только $h$ наиболее вероятных


In [12]:
alphabet = list(set(chars))
print('total chars:', len(alphabet))

total chars: 26


In [13]:
maxlen = 5
step = 1
ngrams = []
next_chars = []
for name in names:
    for i in range(0, len(name) - maxlen, step):
        ngrams.append(' '.join([char for char in name[i: i + maxlen]]))
        next_chars.append(name[i + maxlen])
print('nb ngrams:', len(ngrams))
print(ngrams[0],next_chars[0])
print(ngrams[1],next_chars[1])

nb ngrams: 10701
a a c h e n
a c h e n o


In [14]:
tokenizer = Tokenizer(num_words=len(alphabet))
tokenizer.fit_on_texts(ngrams)

sequences = tokenizer.texts_to_sequences(ngrams)
X_train = pad_sequences(sequences, maxlen=maxlen)
sequences = tokenizer.texts_to_sequences(next_chars)
y_train = tokenizer.sequences_to_matrix(sequences)
X_train[0]

array([ 1,  1, 12, 11,  7], dtype=int32)

In [15]:
y_train[0]

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [16]:
char_index = tokenizer.word_index
index_char = {i: c for c, i in char_index.items()}

In [17]:
model = Sequential()

model.add(Embedding(len(alphabet), 50, input_length=maxlen))
model.add(Flatten())
model.add(Dense(64, activation = 'softmax'))
model.add(Dropout(0.2))
model.add(Dense(32, activation = 'softmax'))
model.add(Dropout(0.2))
model.add(Dense(len(alphabet), activation = 'softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
for iteration in range(1, 100):
    X_train_shuffled, y_train_shuffled = shuffle(X_train,y_train)
    model.fit(X_train_shuffled, y_train_shuffled, batch_size=len(X_train), epochs=1, verbose = 0)

In [19]:
def sample(preds):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) #/ temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.choice(range(len(alphabet)), p = preds)
    return probas

In [20]:
import random
import sys
import numpy as np

generated = ''
seed = 'anton'
generated += seed
print('----- Generating with seed: "' + seed + '"')
print(generated)

for i in range(8):
    sequences = tokenizer.texts_to_sequences([' '.join([char for char in generated[-maxlen:]])])
    X_pred = pad_sequences(sequences, maxlen=maxlen)
    preds = model.predict(X_pred, verbose=0)[0]
    next_index = sample(preds)
    next_char = index_char[next_index]
    generated += next_char
    print(generated)

----- Generating with seed: "anton"
anton
antons
antonsy
antonsyr
antonsyrn
antonsyrnc
antonsyrncj
antonsyrncje
antonsyrncjew


#### Задание 3

Измените код выше так, чтобы генерировались панграмы – имена динозавров, не содержащие повторяющихся букв

In [21]:
# решение 3
for i in range(5):
    sequences = tokenizer.texts_to_sequences([' '.join([char for char in generated[-maxlen:]])])
    X_pred = pad_sequences(sequences, maxlen=maxlen)
    preds = model.predict(X_pred, verbose=0)[0]
    next_index = sample(preds)
    next_char = index_char[next_index]
    if next_char not in generated:
        generated += next_char
        print(generated)
    else:
        preds[next_index] = 10e-5
        next_index = sample(preds)
        next_char = index_char[next_index]
        generated += next_char
        print(generated)

antonsyrncjewm
antonsyrncjewms
antonsyrncjewmsx
antonsyrncjewmsxp
antonsyrncjewmsxpz


#### Задание 4

Измените функцию семлирования `sample`: добавьте параметр `t`, изпольузуемый для шкалирования вероятностей  `preds`: ```
preds /= t
``` 

Как использование этого параметра влияет на генерируемые имена?

In [22]:
# решение 4
def sample(preds, t=1.2):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / t
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.choice(range(len(alphabet)), p = preds)
    return probas

In [23]:
# решение 4
for i in range(5):
    sequences = tokenizer.texts_to_sequences([' '.join([char for char in generated[-maxlen:]])])
    X_pred = pad_sequences(sequences, maxlen=maxlen)
    preds = model.predict(X_pred, verbose=0)[0]
    next_index = sample(preds)
    next_char = index_char[next_index]
    if next_char not in generated:
        generated += next_char
        print(generated)
    else:
        preds[next_index] = 10e-5
        next_index = sample(preds)
        next_char = index_char[next_index]
        generated += next_char
        print(generated)

antonsyrncjewmsxpzl
antonsyrncjewmsxpzlg
antonsyrncjewmsxpzlgf
antonsyrncjewmsxpzlgfy
antonsyrncjewmsxpzlgfyd


### Рекуррентные нейронные языковые модели

RNN позволяют уйти от Марковских допущений и позволяют учитывать предысторию произвольной длины.

$x_{1:n} = x_1, x_2, \ldots, x_n$, $x_i \in \mathbb{R}^{d_{in}}$

$y_n = RNN(x_{1:n})$, $y_n \in \mathbb{R}^{d_{out}}$

Для каждого префикса $x_{i:i}$ $y_i$ – выходной вектор.

$y_i = RNN(x_{1:i})$

$y_{1:n} = RNN^{*}(x_{1:n})$, $y_i \in \mathbb{R}^{d_{out}}$

In [24]:
from keras.utils import to_categorical
import numpy as np

X_names = ['bos ' + ' '.join(name) for name in names]
Y_names = [' '.join(name) + ' eos' for name in names]
maxlen = max([len(name) for name in names])+1

In [25]:
tokenizer = Tokenizer(num_words=len(alphabet)+2)
tokenizer.fit_on_texts(X_names+Y_names)

sequences = tokenizer.texts_to_sequences(X_names)
X_train = pad_sequences(sequences, maxlen=maxlen, padding='post')


sequences = tokenizer.texts_to_sequences(Y_names)
Y_train = pad_sequences(sequences, padding='post')


Y_train_cat  = [to_categorical(sent, num_classes=len(alphabet)+2) for sent in Y_train]
Y_train =  np.asarray(Y_train_cat)

In [26]:
print(X_names[0])
print(Y_names[0])


print(X_train.shape)
print(Y_train.shape)

print(tokenizer.word_index['bos'])
print(tokenizer.word_index['eos'])

bos a a c h e n o s a u r u s
a a c h e n o s a u r u s eos
(1536, 27)
(1536, 27, 28)
10
11


In [27]:
char_index = tokenizer.word_index
index_char = {i: c for c, i in char_index.items()}

In [28]:
model = Sequential()

model.add(Embedding(len(alphabet)+2, 30, input_length=maxlen))
model.add(LSTM(128, return_sequences = True))

model.add(Dense(len(alphabet)+2, activation = 'softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
for iteration in range(1, 20):
    X_train_shuffled, y_train_shuffled = shuffle(X_train, Y_train)
    model.fit(X_train_shuffled, y_train_shuffled, batch_size=len(X_train), epochs=1, verbose = 1)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [30]:
def sample(preds):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) #/ temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.choice(range(len(alphabet)+2), p = preds)
    return probas

In [31]:
generated = ''
seed = 'bos'
generated += seed + ' '
print('----- Generating with seed: "' + seed + '"')
print(generated)


for i in range(7): 
    sequences = tokenizer.texts_to_sequences([seed])
    X_pred = pad_sequences(sequences, maxlen=maxlen, padding = 'post')

    preds = model.predict(X_pred, verbose=0)[0]
    samples = [sample(p) for p in preds]
    next_index = samples[i]
    while next_index == 0 or next_index == 10:
        samples = [sample(p) for p in preds]
        next_index = samples[i]
    next_char = index_char[next_index]
    generated += next_char + ' '
    print(generated)
    seed += next_char
    if next_char == 'eos':
        break
    

----- Generating with seed: "bos"
bos 
bos o 
bos o u 
bos o u j 
bos o u j f 
bos o u j f j 
bos o u j f j e 
bos o u j f j e k 
