### Data preprocessing

In [1]:
import spacy
spacy_xx = spacy.load('xx_ent_wiki_sm')

In [2]:
# Load the whole corpus into memory as a single string.
with open('fenomen.txt', mode='r', encoding="utf-8") as corpus_file:
    data = corpus_file.read()

In [3]:
import re
# Flatten the text to one line and map every Latin 'a' to its Cyrillic look-alike.
data = data.replace('\n', ' ')
data = data.replace('a', 'а')
data = data.strip()
# Crude sentence segmentation: each full stop is replaced by an explicit EOS marker.
sents = data.split('.')
# Joining leaves runs of spaces around the markers; collapse them to single spaces.
data = re.sub(' +', ' ', " EOS ".join(sents))

In [4]:
data[:250]

'Для нас дух имеет своей предпосылкой природу, он является ее истиной, и тем самым абсолютно первым в отношении ее EOS В этой истине природа исчезла, и дух обнаружился в ней как идея, достигшая своего для-себя-бытия, - как идея, объект которой, так же'

In [5]:
def create_wordlist(doc):
    """Return the lower-cased text of each token in `doc`.

    Tokens whose text is a newline, a double newline, a thin space or a
    no-break space are dropped.
    """
    skipped = ("\n","\n\n",'\u2009','\xa0')
    return [token.text.lower() for token in doc if token.text not in skipped]

In [6]:
doc = spacy_xx(data)
wl = create_wordlist(doc)

In [8]:
wl

['для',
 'нас',
 'дух',
 'имеет',
 'своей',
 'предпосылкой',
 'природу',
 ',',
 'он',
 'является',
 'ее',
 'истиной',
 ',',
 'и',
 'тем',
 'самым',
 'абсолютно',
 'первым',
 'в',
 'отношении',
 'ее',
 'eos',
 'в',
 'этой',
 'истине',
 'природа',
 'исчезла',
 ',',
 'и',
 'дух',
 'обнаружился',
 'в',
 'ней',
 'как',
 'идея',
 ',',
 'достигшая',
 'своего',
 'для-себя-бытия',
 ',',
 '-',
 'как',
 'идея',
 ',',
 'объект',
 'которой',
 ',',
 'так',
 'же',
 'как',
 'и',
 'ее',
 'субъект',
 ',',
 'есть',
 'понятие',
 'eos',
 'это',
 'тождество',
 'есть',
 'абсолютная',
 'отрицательность',
 ',',
 'ибо',
 'в',
 'природе',
 'понятие',
 'обладает',
 'своей',
 'полной',
 'внешней',
 'объективностью',
 ',',
 'однако',
 'это',
 'его',
 'отчуждение',
 'становится',
 'тождественным',
 'с',
 'самим',
 'собой',
 'eos',
 'тем',
 'самым',
 'оно',
 'есть',
 'это',
 'тождество',
 'только',
 'как',
 'возвращение',
 'к',
 'себе',
 'из',
 'природы',
 'eos',
 'развитие',
 'духа',
 'состоит',
 'в',
 'том',
 ',',


In [9]:
import collections

# Keep only the most frequent tokens; as_matrix maps everything else to "unknown".
N_most_common = 1500
word_counts = collections.Counter(wl)
wc_most_common = word_counts.most_common(n=N_most_common)
words = [token for token, _count in wc_most_common]

# Index -> word table: the kept words plus the "unknown" placeholder, sorted.
vocabulary_inv = sorted(["unknown"] + words)

# Word -> index table (inverse of vocabulary_inv).
vocab = {word: index for index, word in enumerate(vocabulary_inv)}

# Number of real words only: the "unknown" placeholder is NOT counted here,
# so the index tables above hold vocab_size + 1 entries.
vocab_size = len(words)
assert vocab_size == N_most_common #however I also have "unknown" token
print("vocab size: ", vocab_size)

vocab size:  1500


To keep the network from having to predict a probability vector over a huge (potentially million-word) vocabulary at every step, I simply restrict the vocabulary to the most frequent words.

In [10]:
# The corpus contained some Latin 'a' characters, which look like the Cyrillic 'а'
# but are different code points. Print every vocabulary entry made purely of ASCII
# characters to check that only punctuation, digits and the special tokens remain.
for w in vocabulary_inv:
    if all(ord(ch) < 128 for ch in w):
        print(w)
        


"
(
)
,
-
--
1
2
3
:
;
=
_
eos
unknown


In [11]:
vocabulary_inv

['"',
 '(',
 ')',
 ',',
 '-',
 '--',
 '1',
 '2',
 '3',
 ':',
 ';',
 '=',
 '_',
 'eos',
 'unknown',
 'а',
 'абсолютная',
 'абсолютно',
 'абсолютное',
 'абсолютной',
 'абсолютный',
 'абстрагированной',
 'абстрактная',
 'абстрактно',
 'абстрактного',
 'абстрактное',
 'абстрактной',
 'абстрактном',
 'абстрактному',
 'абстрактным',
 'антропологии',
 'антропологического',
 'антропологичной',
 'аперцепций',
 'бедное',
 'без',
 'безусловно',
 'берущим',
 'бесконечного',
 'бесконечное',
 'бесконечному',
 'бесконечность',
 'бессилия',
 'бессильная',
 'бессодержательное',
 'бессознательное',
 'благодаря',
 'ближайшая',
 'ближайшей',
 'ближайшим',
 'бога',
 'богатое',
 'богатым',
 'боге',
 'божественного',
 'более',
 'болезненные',
 'больше',
 'большего',
 'большей',
 'борьба',
 'борьбой',
 'борьбу',
 'борьбы',
 'борющихся',
 'брать',
 'будто',
 'будучи',
 'будь',
 'бы',
 'бывает',
 'был',
 'была',
 'были',
 'было',
 'бытие',
 'бытии',
 'бытию',
 'бытия',
 'быть',
 'в',
 'в-себе-и-для-себя',
 'в-с

In [12]:
# Slide a fixed-size window over the token list with stride 1; for every window
# also remember the token that immediately follows it.
seq_length = 30
n_windows = len(wl) - seq_length
sequences = [wl[start: start + seq_length] for start in range(0, n_windows)]
next_words = [wl[start + seq_length] for start in range(0, n_windows)]

All sequences are already tokenized and lowercased. In addition, an 'eos' token marks the end of each sentence.

In [None]:
# Indices of the two special tokens, looked up once for reuse.
eos_ix, unk_ix = vocab['eos'], vocab['unknown']

In [None]:
import numpy as np
def as_matrix(sequences, max_len=None):
    """
    Convert a list of token sequences into a matrix of vocabulary indices.

    :param sequences: sized iterable of token sequences (each a sequence of words)
    :param max_len: number of columns; longer sequences are truncated to it.
        When omitted (or 0) the length of the longest sequence is used.
    :returns: int32 array of shape [len(sequences), max_len]

    Words missing from the global `vocab` are mapped to the global `unk_ix`.
    Rows shorter than max_len are right-padded with 0.
    NOTE(review): 0 is presumably also the index of a regular vocabulary entry
    (vocab is built with enumerate), so padding is not distinguishable from that
    token -- confirm this is acceptable before feeding ragged batches.
    """
    # default=0 keeps an empty list of sequences from crashing max().
    max_len = max_len or max(map(len, sequences), default=0)

    matrix = np.zeros((len(sequences), max_len), dtype='int32')
    for i, seq in enumerate(sequences):
        row_ix = [vocab.get(word, unk_ix) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    return matrix

In [71]:
as_matrix([["бытие"]])

array([[211]], dtype=int32)

In [16]:
#sanity check
as_matrix(sequences[0:3])

array([[ 250,  540,  295,  381, 1143,  910,  955,    3,  686, 1496,  321,
         407,    3,  360, 1360, 1122,   17,  787,   80,  758,  321,   13,
          80, 1486,  399,  943,  413,    3,  360,  295],
       [ 540,  295,  381, 1143,  910,  955,    3,  686, 1496,  321,  407,
           3,  360, 1360, 1122,   17,  787,   80,  758,  321,   13,   80,
        1486,  399,  943,  413,    3,  360,  295,  635],
       [ 295,  381, 1143,  910,  955,    3,  686, 1496,  321,  407,    3,
         360, 1360, 1122,   17,  787,   80,  758,  321,   13,   80, 1486,
         399,  943,  413,    3,  360,  295,  635,   80]], dtype=int32)

### Network


I will build a language model (the conditional probability distribution of the next word given a prefix sequence). To achieve this, I will use a plain LSTM RNN. To train it, I will take extracts of the original text (of length, say, 30 to 50 words) and train the model to predict the next word given the preceding sequence. In practice the model outputs a probability for every word, so I work with a vocabulary of constrained size (the 1500-2000 most frequent words). The loss function is cross-entropy.

In [22]:
import torch, torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [23]:
#the structure is partially borrowed from the exercises from https://github.com/yandexdataschool/Practical_DL/blob/master/homework04/part2_image_captioning.ipynb
class CaptionNet(nn.Module):
    """Word-level LSTM language model: embedding -> LSTM -> linear projection to logits."""

    def __init__(self, n_tokens=1501, emb_size=128, lstm_units=256, cnn_feature_size=2048):
        """
        A recurrent 'head' network for sequence generation.

        :param n_tokens: vocabulary size (rows of the embedding, width of the output)
        :param emb_size: dimensionality of the word embeddings
        :param lstm_units: size of the LSTM hidden state
        :param cnn_feature_size: unused; accepted only so existing callers keep working
        """
        # Zero-argument super(): super(self.__class__, self) recurses forever
        # as soon as this class is subclassed.
        super().__init__()

        # create embedding for input words. Use the parameters (e.g. emb_size).
        self.emb = nn.Embedding(n_tokens, emb_size)

        self.lstm = nn.LSTM(emb_size, lstm_units, batch_first=True)
        self.logits = nn.Linear(lstm_units, n_tokens)

    def forward(self, captions_ix):
        """
        Apply the network in training mode.

        :param captions_ix: integer tensor of token indices, shape: [batch, word_i].
            All rows are processed over their full length (no padding mask is applied).
        :returns: logits for next token at each tick, shape: [batch, word_i, n_tokens]
        """
        captions_emb = self.emb(captions_ix)

        # Every row has the same length, so no packing is needed. With no explicit
        # state nn.LSTM starts from zeros of the right size on the input's device,
        # which removes the hard-coded hidden size of 256 and the forced .cuda().
        lstm_out, _ = self.lstm(captions_emb)

        return self.logits(lstm_out)

In [20]:
# One output/embedding slot per entry of the index -> word table (the kept words plus
# "unknown"); derived from the table so it cannot drift from the vocabulary settings.
n_tokens = len(vocabulary_inv)
network = CaptionNet(n_tokens)

In [24]:
def compute_loss(network, captions_ix):
    """
    Mean cross-entropy of predicting each token from the tokens before it.

    :param network: module mapping token indices [batch, time] to
        logits [batch, time, n_tokens]
    :param captions_ix: integer tensor of token indices, shape [batch, time]
    :returns: scalar tensor, the loss averaged over all batch * (time - 1) predictions
    """
    # captions for input - all except last cuz we don't know next token for last one.
    captions_ix_inp = captions_ix[:, :-1].contiguous()
    captions_ix_next = captions_ix[:, 1:].contiguous()

    # Call the module itself rather than .forward so that registered hooks run.
    logits_for_next = network(captions_ix_inp)

    n_tokens = logits_for_next.size(-1)

    # CrossEntropyLoss already averages over all positions, so no extra .mean() is needed.
    criterion = nn.CrossEntropyLoss()
    return criterion(logits_for_next.reshape(-1, n_tokens), captions_ix_next.reshape(-1))

In [22]:
network.cuda()

CaptionNet(
  (emb): Embedding(1501, 128)
  (lstm): LSTM(128, 256, batch_first=True)
  (logits): Linear(in_features=256, out_features=1501, bias=True)
)

Creating optimizer for the network

In [23]:
# Move the model to the GPU first so the optimizer is built over the CUDA parameters.
network = network.cuda()
opt = torch.optim.Adam(params=network.parameters())

Splitting the data into train and val:

In [24]:
from sklearn.model_selection import train_test_split

# generate_batch picks rows with an integer index array, which needs NumPy arrays
# rather than Python lists.
sequences = np.asarray(sequences)
next_words = np.asarray(next_words)

# Hold out 10% of the windows for validation (fixed seed for repeatability).
# NOTE(review): the windows were cut with stride 1, so neighbouring windows share
# all but one token; a random split therefore places near-copies of training
# windows in the validation set, and val loss will track train loss closely.
train_captions, val_captions, train_ans, val_ans = train_test_split(
    sequences,
    next_words,
    test_size=0.1,
    random_state=42,
)

In [28]:
from random import choice

def generate_batch(captions, batch_size, max_caption_len=None, device=None):
    """
    Sample a random batch of token sequences and return it as an index tensor.

    :param captions: NumPy array of token sequences (must support indexing with an
        integer array); rows are drawn uniformly with replacement
    :param batch_size: number of rows to draw
    :param max_caption_len: forwarded to as_matrix as max_len (None = longest row)
    :param device: torch device for the result; defaults to CUDA when it is
        available (the previous behaviour) and to the CPU otherwise
    :returns: int64 tensor of vocabulary indices, shape [batch_size, max_len]
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # sample random numbers for caption indices
    random_ix = np.random.randint(0, len(captions), size=batch_size)
    captions_for_batch = captions[random_ix]

    # convert to matrix; int64 because embedding lookups expect long indices
    batch_captions_ix = as_matrix(captions_for_batch, max_len=max_caption_len).astype(np.int64)

    return torch.from_numpy(batch_captions_ix).to(device)
    

In [26]:
list(generate_batch(sequences,3))

[tensor([  883,   515,  1138,    13,  1175,  1448,  1385,  1371,    14,
          1400,   611,  1135,     3,   432,   686,    71,    14,    80,
           429,  1130,    13,  1126,    14,  1400,   611,     3,  1176,
           422,   602,   947], device='cuda:0'),
 tensor([  350,    80,  1357,   328,   481,  1294,     3,    80,   423,
          1299,  1368,  1153,   682,   360,  1269,   250,  1164,   658,
            13,   923,    13,  1111,  1289,   352,  1164,   422,  1355,
             3,   451,    80], device='cuda:0'),
 tensor([   14,   327,   255,  1372,     3,  1466,    14,    14,    80,
            14,   319,  1029,   700,     3,  1370,   326,   372,   676,
           372,  1483,   700,    14,    14,   323,    14,     3,  1370,
          1484,   319,   726], device='cuda:0')]

### Main loop

Train on minibatches just as usual. Evaluate on val from time to time.

In [27]:
# Training-loop settings (the training cell below assigns the same values again).
N_train = len(train_captions)
N_val = len(val_captions)
batch_size = 128  # adjust me
n_epochs = 100  # adjust me
# Whole batches that fit into each split; batches themselves are sampled at random.
n_batches_per_epoch = N_train // batch_size  # adjust me
n_validation_batches = N_val // batch_size  # how many batches are used for validation after each epoch

In [35]:
# Training configuration.
N_train = len(train_captions)
N_val= len(val_captions)
batch_size = 128  # adjust me
n_epochs = 100  # adjust me
# generate_batch samples rows with replacement, so one batch can always be drawn from
# a non-empty split; max(1, ...) keeps the loss averages below from dividing by zero
# when a split holds fewer than batch_size windows.
n_batches_per_epoch = max(1, N_train//batch_size)  # adjust me
n_validation_batches = max(1, N_val//batch_size)   # how many batches are used for validation after each epoch

from tqdm import tqdm_notebook

for epoch in range(n_epochs):

    # ---- training phase ----
    train_loss = 0.0
    network.train(True)
    for _ in tqdm_notebook(range(n_batches_per_epoch)):
        # clear old gradients; do a backward pass to get new gradients; then train with opt
        opt.zero_grad()
        loss_t = compute_loss(network, generate_batch(train_captions, batch_size))

        loss_t.backward()
        opt.step()

        train_loss += loss_t.item()

    train_loss /= n_batches_per_epoch

    # ---- validation phase: no gradients needed, so skip building autograd graphs ----
    val_loss = 0.0
    network.train(False)
    with torch.no_grad():
        for _ in range(n_validation_batches):
            loss_t = compute_loss(network, generate_batch(val_captions, batch_size))
            val_loss += loss_t.item()
    val_loss /= n_validation_batches

    print('\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))

print("Finished!")

HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 0, train loss: 5.575155717780791, val loss: 4.9975600772433815


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 1, train loss: 4.598073160791972, val loss: 4.28401353624132


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 2, train loss: 3.9150280981178742, val loss: 3.628547774420844


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 3, train loss: 3.319160263222384, val loss: 3.0937283039093018


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 4, train loss: 2.7975434940981576, val loss: 2.6195134851667614


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 5, train loss: 2.3633670490908334, val loss: 2.245490868886312


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 6, train loss: 2.005252325391195, val loss: 1.9187861283620198


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 7, train loss: 1.7058123436318822, val loss: 1.6529371076160007


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 8, train loss: 1.4467661021703697, val loss: 1.392942984898885


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 9, train loss: 1.2318476395434643, val loss: 1.2035102446873982


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 10, train loss: 1.0439868748906147, val loss: 1.0377031962076824


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 11, train loss: 0.8854383230209351, val loss: 0.8841807378662957


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 12, train loss: 0.7578568314931479, val loss: 0.7835063205824958


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 13, train loss: 0.6573388131268053, val loss: 0.6750806437598335


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 14, train loss: 0.5713456028915314, val loss: 0.6072143250041537


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 15, train loss: 0.5041206353400127, val loss: 0.5392769045299954


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 16, train loss: 0.4498921841023916, val loss: 0.49171894126468235


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 17, train loss: 0.40321885534079677, val loss: 0.45615735318925643


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 18, train loss: 0.3641209236110549, val loss: 0.42626358403099907


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 19, train loss: 0.338126566036638, val loss: 0.3982640736632877


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 20, train loss: 0.3133104303515101, val loss: 0.37729722261428833


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 21, train loss: 0.29159904495779293, val loss: 0.36072802874777055


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 22, train loss: 0.27454146383756617, val loss: 0.3497927188873291


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 23, train loss: 0.2607094473149403, val loss: 0.340895887878206


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 24, train loss: 0.247806845659233, val loss: 0.3235565192169613


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 25, train loss: 0.23881145611584906, val loss: 0.3158655928240882


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 26, train loss: 0.22680078249379812, val loss: 0.31391803092426723


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 27, train loss: 0.22067251226988183, val loss: 0.30342626240518356


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 28, train loss: 0.2124744079199182, val loss: 0.29177021318011814


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 29, train loss: 0.2072596122701484, val loss: 0.29502227240138584


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))



Epoch: 30, train loss: 0.2026153366608792, val loss: 0.29182083076900905


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))




KeyboardInterrupt: 

I tried this several times, stopping the training to evaluate the results, and can say that 20 epochs is more than enough.

### Generate caption


In [31]:
def generate_caption(caption_prefix = ("а",),
                     t=1, sample=True, max_len=100):
    """
    Extend a prefix word by word with the global `network`.

    Generation stops after 'eos' is produced or after `max_len` new words.

    :param caption_prefix: non-empty iterable of starting tokens (copied, not mutated);
        tokens missing from the vocabulary are mapped to the unknown index by as_matrix
    :param t: exponent applied to the predicted probabilities before renormalising;
        t > 1 sharpens the distribution, t < 1 flattens it
    :param sample: if True draw the next word at random from the distribution,
        otherwise always take the most likely word
    :param max_len: maximum number of words to append
    :returns: list of tokens: the prefix followed by the generated words
    :raises ValueError: if caption_prefix is empty
    """
    caption_prefix = list(caption_prefix)
    if not caption_prefix:
        raise ValueError("caption_prefix must contain at least one token")

    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(network.parameters()).device

    # Inference only: no_grad replaces the deprecated Variable(..., volatile=True).
    with torch.no_grad():
        for _ in range(max_len):

            prefix_ix = as_matrix([caption_prefix]).astype(np.int64)
            prefix_ix = torch.from_numpy(prefix_ix).to(device)
            # Logits for the word following the last prefix token.
            next_word_logits = network(prefix_ix)[0, -1]
            # Explicit dim: the implicit choice is deprecated and emits a warning.
            next_word_probs = F.softmax(next_word_logits, dim=-1).cpu().numpy()

            assert len(next_word_probs.shape) ==1, 'probs must be one-dimensional'
            # float64 keeps the renormalised vector summing to 1 closely enough for np.random.choice
            next_word_probs = next_word_probs.astype(np.float64)
            next_word_probs = next_word_probs ** t / np.sum(next_word_probs ** t) # apply temperature

            if sample:
                next_word = np.random.choice(vocabulary_inv, p=next_word_probs)
            else:
                next_word = vocabulary_inv[np.argmax(next_word_probs)]

            caption_prefix.append(next_word)

            if next_word=="eos":
                break

    return caption_prefix

In [43]:
generate_caption()

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


['а',
 'также',
 'благодаря',
 'тому',
 ',',
 'что',
 'unknown',
 'есть',
 'в',
 'себе',
 'и',
 'для',
 'самосознания',
 'по',
 'отношению',
 'ко',
 'мне',
 'быть',
 'некоторым',
 'самостоятельным',
 'другим',
 ',',
 'чем-то',
 'рефлектированным',
 'в',
 'самое',
 'себя',
 ',',
 'как',
 'другое',
 'для',
 'другого',
 'только',
 'как',
 'сущее',
 ',',
 'и',
 'другое',
 'eos']

In [47]:
generate_caption(t=2)

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


['а',
 'абсолютная',
 'деятельность',
 ';',
 'и',
 'оно',
 'действительно',
 'снимает',
 'его',
 ',',
 'поскольку',
 'unknown',
 'предметом',
 ',',
 'только',
 'unknown',
 'себя',
 'за',
 'самостоятельный',
 ',',
 '-',
 'unknown',
 'себя',
 ',',
 'unknown',
 'его',
 ',',
 'и',
 'сохраняет',
 'себя',
 'в',
 'этом',
 'процессе',
 ',',
 'так',
 'как',
 'оно',
 '-',
 'самоцель',
 'eos']

In [1]:
import spacy
spacy_xx = spacy.load('xx_ent_wiki_sm')

In [2]:
def create_wordlist(doc):
    """Return the lower-cased text of each token in `doc`.

    Tokens whose text is a newline, a double newline, a thin space or a
    no-break space are dropped.
    """
    skipped = ("\n","\n\n",'\u2009','\xa0')
    return [token.text.lower() for token in doc if token.text not in skipped]

In [3]:
import re

def create_wl(filename):
    """Read a UTF-8 text file and return its lower-cased token list.

    Newlines become spaces, every Latin 'a' is mapped to the Cyrillic 'а', and each
    full stop is replaced by an EOS marker (lower-cased to 'eos' by create_wordlist).
    Splitting on '.' is a deliberately crude sentence segmentation.
    """
    with open(filename, 'r', encoding="utf-8") as source:
        raw = source.read()

    flat = raw.replace('\n', ' ').replace('a', 'а').strip()
    # Joining leaves runs of spaces around the markers; collapse them to single spaces.
    marked = re.sub(' +', ' ', " EOS ".join(flat.split('.')))

    return create_wordlist(spacy_xx(marked))

In [13]:
import collections
# Per-text cap on the number of most frequent tokens contributed to the vocabulary.
N_most_common = 2000

txt_paths = ['revisor.txt', 'fenomen.txt']
texts_wl = []      # one token list per text, in txt_paths order
voc_set = set()    # union of the most frequent tokens of every text

for filename in txt_paths:
    wl = create_wl(filename)
    texts_wl.append(wl)
    word_counts = collections.Counter(wl)
    wc_most_common = word_counts.most_common(n=N_most_common)
    voc_set.update(token for token, _count in wc_most_common)

In [16]:
# Index -> word table: the union vocabulary plus the "unknown" placeholder, sorted.
words = list(voc_set)
vocabulary_inv = sorted(words + ["unknown"])

# Word -> index table (inverse of vocabulary_inv).
vocab = {word: index for index, word in enumerate(vocabulary_inv)}

# Number of real words only: the "unknown" placeholder is not counted here.
vocab_size = len(words)
print("vocab size: ", vocab_size)

vocab size:  3669


In [19]:
# Slide a fixed-size window with stride 1 over each text separately, so that no
# window spans the boundary between two texts.
seq_length = 50
sequences = [
    wl[start: start + seq_length]
    for wl in texts_wl
    for start in range(0, len(wl) - seq_length)
]

In [88]:
# vocab_size counts the real words only; add one slot for the "unknown" placeholder.
n_tokens = vocab_size + 1
network = CaptionNet(n_tokens).cuda()
opt = torch.optim.Adam(params=network.parameters())

In [89]:
from sklearn.model_selection import train_test_split

# generate_batch picks rows with an integer index array, which needs a NumPy array.
sequences = np.asarray(sequences)

# Hold out 10% of the windows for validation (fixed seed for repeatability).
train_captions, val_captions = train_test_split(
    sequences,
    test_size=0.1,
    random_state=42,
)

In [90]:
train_captions[2]

array(['сознания', 'и', 'самосознания', 'содержит', 'в', 'себе', 'прежде',
       'всего', 'единичные', 'личности', 'как', 'светящиеся',
       'видимостью', 'друг', 'в', 'друге', 'eos', 'но', 'их', 'различие',
       'в', 'этом', 'тождестве', 'есть', 'совершенно', 'неопределенная',
       'разность', 'их', 'или', ',', 'скорее', ',', 'такое', 'различие',
       ',', 'которое', 'не', 'есть', 'различие', 'eos', 'их', 'истина',
       'есть', 'поэтому', 'в-себе-и-для-себя', 'сущая', 'всеобщность',
       'и', 'объективность', 'самосознания'], dtype='<U29')

In [91]:
# Training configuration.
N_train = len(train_captions)
N_val= len(val_captions)
batch_size = 128  # adjust me
n_epochs = 15  # adjust me
# generate_batch samples rows with replacement, so one batch can always be drawn from
# a non-empty split; max(1, ...) keeps the loss averages below from dividing by zero
# when a split holds fewer than batch_size windows.
n_batches_per_epoch = max(1, N_train//batch_size)  # adjust me
n_validation_batches = max(1, N_val//batch_size)   # how many batches are used for validation after each epoch

from tqdm import tqdm_notebook

for epoch in range(n_epochs):

    # ---- training phase ----
    train_loss = 0.0
    network.train(True)
    for _ in tqdm_notebook(range(n_batches_per_epoch)):
        # clear old gradients; do a backward pass to get new gradients; then train with opt
        opt.zero_grad()
        loss_t = compute_loss(network, generate_batch(train_captions, batch_size))

        loss_t.backward()
        opt.step()

        train_loss += loss_t.item()

    train_loss /= n_batches_per_epoch

    # ---- validation phase: no gradients needed, so skip building autograd graphs ----
    val_loss = 0.0
    network.train(False)
    with torch.no_grad():
        for _ in range(n_validation_batches):
            loss_t = compute_loss(network, generate_batch(val_captions, batch_size))
            val_loss += loss_t.item()
    val_loss /= n_validation_batches

    print('\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))

print("Finished!")

HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 0, train loss: 5.1377797376859435, val loss: 4.337369272785802


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 1, train loss: 3.8412239709934153, val loss: 3.4446707002578245


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 2, train loss: 3.105330874036242, val loss: 2.8535905422702914


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 3, train loss: 2.5899738690236234, val loss: 2.4074318255147626


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 4, train loss: 2.1838743644994456, val loss: 2.0437165844825005


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 5, train loss: 1.8430378036899167, val loss: 1.7157285674925773


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 6, train loss: 1.54700153274136, val loss: 1.4511906062403033


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 7, train loss: 1.3049590145791328, val loss: 1.2235510387728292


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 8, train loss: 1.0972145118496635, val loss: 1.042678456152639


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 9, train loss: 0.9210569064517121, val loss: 0.8745724039693032


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 10, train loss: 0.7652992979213075, val loss: 0.7324948003215175


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 11, train loss: 0.6336869311916244, val loss: 0.6110076269795818


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 12, train loss: 0.5236102739622542, val loss: 0.517160314706064


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 13, train loss: 0.43589149676002825, val loss: 0.4356150079158045


HBox(children=(IntProgress(value=0, max=286), HTML(value='')))



Epoch: 14, train loss: 0.36631983299772225, val loss: 0.38079540864113837
Finished!


In [117]:
" ".join(generate_caption(["eos", ], t = .7))

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


'eos упомянутое рабское самосознание , все одушевленное и дух обнаружился в тождестве с самим собой и является разум внешний объект для других , предмет тождество , внешний совершенно справедливо для противоречия характер ; другое что для себя теперь является не " я " , погружается во внутреннее самости , государства , через снятие их таким unknown ; единичности , до бы только , то , чему из unknown ехать по весьма unknown делу ну unknown тогда unknown для себя " я определил как вам eos'

In [125]:
" ".join(generate_caption(["вам", ], t = .7))

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


'вам счастье прежде всего сути , просто и к идее природы что-нибудь завтраком eos'

In [150]:
" ".join(generate_caption(["друг", ], t = .7))

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


'друг , как его жена и " я " , постольку самосознание есть в то же время у себя как углубление , проникновение и внедрение в особенно который тождество - в его абсолютной истине не только с объектом , вот другое , а не unknown со мной на дружеской ноге eos'

Generally speaking, the results are quite interesting but far from ideal, since this approach simply samples from a single language model fitted to both texts. Of course, if the texts differ in style and share very few words, then the first few words will almost surely determine the rest of the sequence, and with it the whole style.

### Now what?

This model produces some text, but one might still strive to improve it. To this end, I leave some ideas here.
One approach is based on a classifier: step zero is to build a classifier which, given a sequence, outputs whether it comes from Gogol or Hegel. As for the neural network architecture, I will try a basic LSTM with a softmax layer. Then I will look for an input that minimizes the squared difference between the authorship probabilities. This can be done with gradient descent, since the weights of the network are already known.

In this basic version, I have replaced all rare words with 'unknown', which throws away a lot of information and reduces quality. I will try the approach from https://arxiv.org/abs/1508.07909

Moreover, this seems like the right task for checking whether 'attention is' really 'what we need'...
A starting point: https://arxiv.org/abs/1502.03044 and 'Attention Is All You Need' https://arxiv.org/pdf/1706.03762
Then, taking the words with the highest attention for each author, one could randomly force a word associated with either Gogol or Hegel during sampling. The resulting text should then look like a mix of both styles.