In [1]:
import warnings

# Silence every warning category for the whole notebook session.
warnings.simplefilter("ignore")

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell executes (handy while editing the
# project packages imported below).
%load_ext autoreload
%autoreload 2

In [3]:
import os
import tensorflow as tf
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import json
from collections import defaultdict

from metrics.metrics import det_precision_at_one, precision_at_one
from loss_functions.loss_functions import full_pairwise_softmax
from train_utils.container import ModelContainer
from application.model import ApplicationModel
from batch_generator.dir import DirIterator
from text_sparsifiers.sparsifiers import make_sparsifiers
from application.numpy_storage import NumpyStorage
from application.applier import Applier
from application.context_preparer import ContextPreparer
from dataset.reddit import prepare_reddit, convert
from text_sparsifiers.tokenizers import sequential_tokenizer, filling, filling_words, \
    make_filling_character_ngram, words
from batch_generator.flavors import PLAIN_REPLY, PLAIN_TRAIN, PLAIN_TEST
from model.baseline import Baseline
from trainer.callbacks.reduce_lr_on_plateu import ReduceLROnPlateu
from trainer.callbacks.save_best import SaveBest
from batch_generator.batch_generator import func_batch_generator, BatchGenerator
from trainer.tensorboard_trainer import TensorBoardTrainer
from trainer.tester import Tester

%matplotlib inline

In [4]:
# Batching / context settings; both are stored into the model params later on.
BATCH_SIZE, CONTEXT_LEN = 50, 3

# Directory handed to ModelContainer.
CONFIG_DIR = "/data/reddit/models/baseline/"

# Dataset splits. Note that TEST_DATA, unlike the other two, has no trailing slash.
TRAIN_DATA, VAL_DATA, TEST_DATA = (
    "/data/reddit/f_4_train_filtered/",
    "/data/reddit/f_4_val_filtered/",
    "/data/reddit/f_4_test_filtered",
)

# Passed to ModelContainer as the `ckpt` argument.
REDDIT_TF_WEIGHTS_FNAME = "reddit_tf_weights"

In [5]:
# One directory iterator per split, created in train -> validation order.
train_iterator, val_iterator = (
    DirIterator.from_data_folder(folder) for folder in (TRAIN_DATA, VAL_DATA)
)

In [6]:
# Container rooted at CONFIG_DIR; the rest of the notebook uses it to write and
# read artifacts ('sparsifiers', 'model_params'), to clear logs, and passes it to
# the model, generators, trainer and tester.
# NOTE(review): `ckpt` presumably names the TF checkpoint prefix and `dill=True`
# presumably selects dill-based serialization — confirm in ModelContainer.
mc = ModelContainer(CONFIG_DIR, ckpt=REDDIT_TF_WEIGHTS_FNAME, dill=True)

######################
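Здесь генерируются спарсифаеры. Этот блок нужно запускать, только если их требуется создать заново: ниже они читаются из контейнера через `mc.read`.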

In [8]:
# Tokenizer object that is handed to make_sparsifiers.
# NOTE(review): presumably a character n-gram tokenizer built with default
# settings — confirm in text_sparsifiers.tokenizers.
filling_character_ngram = make_filling_character_ngram()
# The two text fields share identical sparsifier settings.
keys = ['context', 'reply']
vocabulary_size = dict.fromkeys(keys, 2)
modes = dict.fromkeys(keys, 'occurrences')

In [9]:
# Build the sparsifiers from the training iterator, the tokenizer and the
# per-key vocabulary_size / modes settings.
# NOTE(review): vocabulary_size is 2 for both keys, which looks like a debugging
# value — confirm before generating sparsifiers for a real run.
sparsifiers = make_sparsifiers(train_iterator, filling_character_ngram, 
                                     vocabulary_size=vocabulary_size, modes=modes)

In [10]:
# Store the sparsifiers in the model container under the key 'sparsifiers';
# a later cell loads them back with mc.read('sparsifiers').
mc.write('sparsifiers', sparsifiers)

######################

In [7]:
# Load the sparsifiers stored in the model container under the key 'sparsifiers'.
sparsifiers = mc.read('sparsifiers')

In [8]:
# Model hyper-parameters: one settings group per text field ('reply' and
# 'context'), whose vocabulary sizes come from the loaded sparsifiers, plus the
# global batch size and context length.
params = dict(
    reply=dict(
        vocabulary_size=sparsifiers['reply'].vocabulary_size,
        embedding_size=100,
        hiddens=[100, 100],
    ),
    context=dict(
        vocabulary_size=sparsifiers['context'].vocabulary_size,
        embedding_size=100,
        hiddens=[200, 100, 100],
    ),
    batch_size=BATCH_SIZE,
    context_len=CONTEXT_LEN,
)

# Persist the parameters in the model container under 'model_params'.
mc.write('model_params', params)

In [9]:
# Clear all logs: training, validation and test.

mc.clear_logs()

In [10]:
# Reset the default graph BEFORE opening the device scope. In TF1 graph mode
# tf.device() attaches to whichever graph is the default at the moment the scope
# is entered; resetting inside the scope would build the model in a brand-new
# graph that the device scope does not cover.
tf.reset_default_graph()

# Everything below is now actually pinned to the first GPU, as the original
# scope intended.
with tf.device("/device:GPU:0"):
    model_box = Baseline(mc).make_model()

    # Training objective and the metric tracked alongside it, both created
    # through the model box.
    loss = model_box.make_loss(full_pairwise_softmax)
    dpao = model_box.make_loss(det_precision_at_one)

    # The learning rate lives in a non-trainable variable that is shared with
    # the ReduceLROnPlateu callback and logged as a TensorBoard scalar.
    # NOTE(review): presumably the callback multiplies the rate by `factor`
    # after `patience` steps without improvement, down to `min_lr` — confirm.
    learning_rate = tf.Variable(0.001, trainable=False)
    rlr = ReduceLROnPlateu(learning_rate, factor=0.3, patience=100, min_lr=0.000001)
    tf.summary.scalar('learning_rate', learning_rate)

    # NOTE(review): the meaning of the argument 10 is defined by SaveBest — confirm.
    sb = SaveBest(10)

    # Adam optimizer step driven by the variable learning rate.
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [11]:
# One batch generator per split; both use the shared container and the
# PLAIN_TRAIN flavor.
train_gen, val_gen = (
    func_batch_generator(train_iterator, mc, PLAIN_TRAIN),
    func_batch_generator(val_iterator, mc, PLAIN_TRAIN),
)

In [12]:
# Assemble the trainer from: the container, the optimizer step, the named loss,
# the list of extra named metrics, the model box, the train/validation batch
# generators and the callbacks (learning-rate reduction and best-model saving).
# NOTE(review): val_steps / epoch_steps are presumably the number of batches per
# validation pass / per epoch — confirm in TensorBoardTrainer.
lol_trainer = TensorBoardTrainer(mc, train_step, ('loss', loss), [('dpao', dpao)], model_box, 
                                 train_gen, val_gen, [rlr, sb], val_steps=50, epoch_steps=500)

```tensorboard --logdir /data/reddit/models/baseline```

In [None]:
# Start training with the trainer configured in the previous cell.
# NOTE(review): presumably runs until finished or interrupted — confirm.
lol_trainer.start()

In [14]:
# Save the trainer's state.
# NOTE(review): what exactly is persisted (e.g. model weights) is defined by
# TensorBoardTrainer.save — confirm.
lol_trainer.save()

### Здесь запуск на тесте + эмбеддинги в tensorboard

In [None]:
# Tester over the data in TEST_DATA with the PLAIN_TEST flavor, using the
# trained model box and the dpao metric.
tester = Tester(mc, TEST_DATA, PLAIN_TEST, model_box, [dpao])

In [15]:
# Compute the test score; the recorded output is a one-element float32 array
# (a single metric, dpao, was passed to Tester).
tester.get_score()

array([0.83943826], dtype=float32)

Для t-SNE 5 тысяч точек — в самый раз; больше имеет смысл брать, только если вы хотите посмотреть на PCA или что-то ещё.

In [16]:
# Write embeddings for 5000 points so they can be inspected in TensorBoard
# (per the section heading).
# NOTE(review): how the 5000 points are selected is defined by
# Tester.write_embeddings — confirm.
tester.write_embeddings(5000)

```tensorboard --logdir /data/reddit_f_60_utils_lstm_2/testing```


### Здесь поболтать

In [17]:
# Application-time model: a fresh Baseline built from the container, used with
# the PLAIN_REPLY flavor. The recorded output shows TensorFlow restoring
# parameters from the checkpoint under CONFIG_DIR.
tm = ApplicationModel(mc, model=Baseline(mc), flavor=PLAIN_REPLY)

INFO:tensorflow:Restoring parameters from /data/reddit/models/baseline/reddit_tf_weights_-1


In [19]:
# Helper objects that are handed to Applier in the cells below.
# NOTE(review): NumpyStorage presumably holds the reply vectors, and
# ContextPreparer presumably applies dataset.reddit.convert to incoming
# dialogue turns — confirm both roles.
vector_storage = NumpyStorage()
context_preparer = ContextPreparer(convert)

##################################################

Запустите этот блок, если хотите сгенерировать векторы реплаев.

In [20]:
# file_limit -- the number of files to take replies from

iterator = DirIterator.from_data_folder(TRAIN_DATA, file_limit=50) 
# Batch generator over those files with the PLAIN_REPLY flavor.
# NOTE(review): infinite=False presumably means a single finite pass over the
# data — confirm in BatchGenerator.
gen = BatchGenerator(iterator, mc, PLAIN_REPLY, infinite=False)

In [21]:
# Build an Applier from the reply batch generator.
# NOTE(review): presumably runs every batch from `gen` through `tm` and keeps
# the resulting reply vectors in `vector_storage` — confirm in Applier.from_gen.
ap = Applier.from_gen(tm, vector_storage, context_preparer, gen)

1212it [00:13, 88.18it/s]


In [22]:
# Persist the applier through the model container so that it can be restored
# later with Applier.from_pickle instead of being regenerated.
ap.to_pickle(mc)

##################################################

In [23]:
# Restore a previously saved applier from the model container.
# NOTE(review): this is pickle-based loading — only load artifacts that you
# produced yourself.
ap = Applier.from_pickle(tm, vector_storage, context_preparer, mc)

In [None]:
# Interactive chat: read a line from the user, print the model's reply, and
# repeat until the user types 'exit'.
dialogue = ["Mama",
            "Here I am"]
while True:
    message = input()
    if message == 'exit':
        # Stop right away: the sentinel must not be stored in the dialogue or
        # sent to the model for a reply.
        break
    dialogue.append(message)
    # Pass only the most recent CONTEXT_LEN turns, matching the 'context_len'
    # value the model was configured with (previously a hard-coded 3).
    reply = ap.reply(dialogue[-CONTEXT_LEN:])
    print(reply)
    dialogue.append(reply)