# Обучение чат-бота на основе диалогов из фильмов


# 1 Импорт библиотек, установка, настройка

## 1.1 Импорт

In [1]:
from pathlib import Path
import codecs
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

tf.__version__

'2.6.4'

## 1.3 Гиперпараметры

In [2]:
HP_DATASET = {'seq_len'                : 100,
              'batch_size'             : 1024,  # 64
              'examples_per_epoch'     : None,
              'vocab_size'             : None,
             }

## 1.4 Пути и операции с файлами

In [3]:
path_dataset_folder = Path('../input/movie-dialog-corpus/')

path_data = Path('./data/')

path_models = Path('./models/')
path_backup = Path('./models/BackupAndRestore/')
path_chekpoint = Path('./models/Checkpoint/')

path_mlflow = Path('./mlruns/')
path_tblogs = Path('./tblogs/')
path_img = Path('./img/')


paths = [path_data, path_models, path_backup, path_chekpoint, path_mlflow, path_tblogs, path_img]

for path in paths:
    if os.path.exists(path):
        print(f"Folder {path} exists!")
    else:
        os.makedirs(path)
        print(f"Folder {path} was created.")

Folder data was created.
Folder models was created.
Folder models/BackupAndRestore was created.
Folder models/Checkpoint was created.
Folder mlruns was created.
Folder tblogs was created.
Folder img was created.


In [4]:
filename_conversations = 'movie_conversations.tsv' 
filename_lines = 'movie_lines.tsv' 

filename_model_ids_from_chars = 'ids_from_chars'
filename_model_chars_from_ids = 'chars_from_ids'


filename_input_target_ids = f"input_target_ids_len60.tfrecords"
filename_input_target_ids = f"input_target_ids_len200.tfrecords"

# 2 Подготовка текста к загрузке в Dataset

## 2.1 Фразы - `movie_lines.tsv`
* id фразы - `L194`
* id персонажа - `u0`
* id фильма - `m0`
* фраза `text`

In [5]:
def clean_tab_df(x):
    try:
        return x.replace('\t', ' ')
    except:
        return x

In [6]:
path = path_dataset_folder / filename_lines

line_list = []
with open(path, 'r', encoding='utf-8', errors="ignore", newline='\n') as text:
    lines = text.read()
    # в некоторых диалогах используется данный код (новая линия - u'\x85'), он расположен внутри предложений.
    # возможно, часть текстов были в cp1252 в них данный код означает отточие "..."
    replace_list = [u'\x85', u'\x85', u'\x82', u'\x8a', u'\x8c', u'\x91', 
                    u'\x92', u'\x93', u'\x94', u'\x96', u'\x97', u'\xad']
    
    for char in replace_list:
        lines = lines.replace(char, '')
    
    lines = lines.splitlines()
    for line in lines:
#         часть строчек содержит кавычки в начале и конце (kaggle)
        line = line.lstrip('"').rstrip('"')
        line = line.split('\t', maxsplit=4)
        line_list.append(line)

df_frase = pd.DataFrame(line_list, columns=['frase_id', 'char_id', 'movie_id', 'char_name', 'frase_txt'])
df_frase['frase_txt'] = df_frase['frase_txt'].map(clean_tab_df)

df_frase = df_frase.drop(columns=['char_id','movie_id','char_name'])

df_frase.head()

Unnamed: 0,frase_id,frase_txt
0,L1045,They do not!
1,L1044,They do to!
2,L985,I hope so.
3,L984,She okay?
4,L925,Let's go.


## 2.2 Диалоги - `movie_conversations.tsv`
* id первого участника в диалоге - `u0`
* id второго участника в диалоге - `u1`
* id фильма - `m0`
* Фразы из диалога в хранологическом порядке - `[L194, L195, L196, L197]`

In [7]:
def dealog_split(x):
    x = x[1:]
    x = x[:-1]
    x = x.replace("'", "")
    x = x.split(' ')
    return x

In [8]:
path = path_dataset_folder / filename_conversations

dialog_list = []
with open(path, 'r', encoding='utf8') as text:
    dialogs = text.read().splitlines()
    for dialog in dialogs:
        dialog = dialog.split('\t',)
        dialog_list.append(dialog)
        
df_dailogs = pd.DataFrame(dialog_list, columns=['char_id1', 'char_id2', 'movie_id2', 'frase_ids'])
df_dailogs['frase_ids'] = df_dailogs['frase_ids'].map(dealog_split)
df_dailogs.tail()
df_dailogs.head()

Unnamed: 0,char_id1,char_id2,movie_id2,frase_ids
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"
3,u0,u2,m0,"[L204, L205, L206]"
4,u0,u2,m0,"[L207, L208]"


## 2.3 Образцы для обучения: `input_texts` и `target_texts`
* Что говорит персонаж `input_texts`
* Что ему отвечают`target_texts`

In [9]:
# Составляем пары фраз
text_pairs = []

for dialog in df_dailogs['frase_ids']:
    cell_len = len(dialog)
    j, i = 0, 2
    while i <= cell_len:
        text_pairs.append(dialog[j:i])
        i += 1
        j += 1

df_pairs = pd.DataFrame(text_pairs, columns=['input_texts', 'target_texts',])
df_pairs.head()

Unnamed: 0,input_texts,target_texts
0,L194,L195
1,L195,L196
2,L196,L197
3,L198,L199
4,L200,L201


In [10]:
def string_split(x, tags=False):
    # функция для разделения строки на элементы
    # в начале и конце добавляет теги
    if tags:
        splited_string = ['<START>']
        splited_string.extend([*x])
        splited_string.extend(['<END>'])
        return splited_string
        
    return [*x]

In [11]:
# id -> text - lookup
df_input = df_pairs.merge(right = df_frase, 
                          left_on='input_texts',
                          right_on='frase_id',
                          how='left')

input_target = df_input.merge(right = df_frase, 
                           left_on='target_texts',
                           right_on='frase_id',
                           how='left')

input_target = input_target.drop(columns=['input_texts', 'target_texts', 'frase_id_x', 'frase_id_y'])
input_target = input_target.rename(columns={'frase_txt_x': 'input_txt', 'frase_txt_y': 'target_txt'})

input_target['input_txt_split']  = input_target['input_txt'].map(string_split)
input_target['target_txt_split'] = input_target['target_txt'].map(lambda x: string_split(x, True))

input_target.head(3)

Unnamed: 0,input_txt,target_txt,input_txt_split,target_txt_split
0,Can we make this quick? Roxanne Korrine and A...,Well I thought we'd start with pronunciation i...,"[C, a, n, , w, e, , m, a, k, e, , t, h, i, ...","[<START>, W, e, l, l, , I, , t, h, o, u, g, ..."
1,Well I thought we'd start with pronunciation i...,Not the hacking and gagging and spitting part....,"[W, e, l, l, , I, , t, h, o, u, g, h, t, , ...","[<START>, N, o, t, , t, h, e, , h, a, c, k, ..."
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,"[N, o, t, , t, h, e, , h, a, c, k, i, n, g, ...","[<START>, O, k, a, y, ., ., ., , t, h, e, n, ..."


In [12]:
size_1 = input_target.shape[0]

print('До удаления пустых строчек:', size_1)
# Часть диалогов нет первой фразы \ ответной фразы
input_target = input_target.drop(index=input_target[input_target['input_txt'] == ''].index)
input_target = input_target.drop(index=input_target[input_target['target_txt'] == ''].index)

size_2 = input_target.shape[0]
print('После удаления пустых строчек:', size_2, 'Разница:', size_1 - size_2)

input_target.head(3)

До удаления пустых строчек: 221616
После удаления пустых строчек: 221293 Разница: 323


Unnamed: 0,input_txt,target_txt,input_txt_split,target_txt_split
0,Can we make this quick? Roxanne Korrine and A...,Well I thought we'd start with pronunciation i...,"[C, a, n, , w, e, , m, a, k, e, , t, h, i, ...","[<START>, W, e, l, l, , I, , t, h, o, u, g, ..."
1,Well I thought we'd start with pronunciation i...,Not the hacking and gagging and spitting part....,"[W, e, l, l, , I, , t, h, o, u, g, h, t, , ...","[<START>, N, o, t, , t, h, e, , h, a, c, k, ..."
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,"[N, o, t, , t, h, e, , h, a, c, k, i, n, g, ...","[<START>, O, k, a, y, ., ., ., , t, h, e, n, ..."


In [13]:
# Фильтрация - три сигмы
len_input_txt = []
for i in input_target['input_txt']:
    len_input_txt.append(len(i))

input_limit = int(np.std(len_input_txt) * 3)

len_target_txt = []
for i in input_target['target_txt']:
    len_input_txt.append(len(i))

target_limit = int(np.std(len_input_txt) * 3)

print(input_limit, target_limit)

180 187


In [14]:
# отфильтруем значения установив лимит в 200
# для таргета -1 т.к. добавляем теги
size_1 = input_target.shape[0]
print('До удаления слишком длинных диалогов:', size_1)
input_target = input_target.drop(input_target[input_target['input_txt'].map(len) > 200].index) #input_limit   #60
input_target = input_target.drop(input_target[input_target['target_txt'].map(len) > 200-1].index) #target_limit #60
size_2 = input_target.shape[0]
print('После удаления строчек:', size_2, 'Разница:', size_1 - size_2)

# Сбросить индексацию
input_target = input_target.reset_index(drop=True)
input_target.head(3)

До удаления слишком длинных диалогов: 221293
После удаления строчек: 207858 Разница: 13435


Unnamed: 0,input_txt,target_txt,input_txt_split,target_txt_split
0,Can we make this quick? Roxanne Korrine and A...,Well I thought we'd start with pronunciation i...,"[C, a, n, , w, e, , m, a, k, e, , t, h, i, ...","[<START>, W, e, l, l, , I, , t, h, o, u, g, ..."
1,Well I thought we'd start with pronunciation i...,Not the hacking and gagging and spitting part....,"[W, e, l, l, , I, , t, h, o, u, g, h, t, , ...","[<START>, N, o, t, , t, h, e, , h, a, c, k, ..."
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,"[N, o, t, , t, h, e, , h, a, c, k, i, n, g, ...","[<START>, O, k, a, y, ., ., ., , t, h, e, n, ..."


## 2.4 Кодируем данные и создаём dataset

### 2.4.1 Кодировщик и пример работы

In [15]:
# Общий список - весь текст
df_text = ' '.join(df_frase['frase_txt'].tolist())

# Перечень символов
vocab = sorted(set(df_text))
vocab.append('<START>')
vocab.append('<END>')

# Отличается от TextVectorization тем, что нет возможности очищать данные и разделять данные.
# ⚠️Кодирование Символов в Индексы
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
# ⚠️Кодирование Индексов в символы
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None, invert=True)

2022-08-11 20:47:17.137836: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [16]:
def string_join(ids):
  return tf.strings.reduce_join(ids, axis=-1)

In [17]:
# Пример работы созданного слоя
print('Словарь:', ids_from_chars.get_vocabulary()[:10])
print('Длина словаря:', len(ids_from_chars.get_vocabulary()))

# Сохранение гиперпараметра
HP_DATASET['vocab_size'] = len(ids_from_chars.get_vocabulary())

# Кодирование строчки
encoded_example = input_target['target_txt_split'][0]
# ⚠️ Вначале разделение на элементы
encoded_example = string_split(encoded_example)
encoded_example = ids_from_chars(encoded_example)
print('Кодирование строчки:', encoded_example)

# Декодирование строчки
decoded_example = chars_from_ids(encoded_example)
# ⚠️ В конце сборка в строчку
decoded_example = string_join(decoded_example)
print('Декодированая строчка:\n', decoded_example)

Словарь: ['[UNK]', ' ', '!', '"', '#', '$', '%', '&', "'", ')']
Длина словаря: 127
Кодирование строчки: tf.Tensor(
[125  54  67  74  74   1  40   1  82  70  77  83  69  70  82   1  85  67
   8  66   1  81  82  63  80  82   1  85  71  82  70   1  78  80  77  76
  83  76  65  71  63  82  71  77  76   1  71  68   1  82  70  63  82   8
  81   1  77  73  63  87   1  85  71  82  70   1  87  77  83  14 126], shape=(71,), dtype=int64)
Декодированая строчка:
 tf.Tensor(b"<START>Well I thought we'd start with pronunciation if that's okay with you.<END>", shape=(), dtype=string)


### 2.4.2 Сохранение кодироващиков

In [18]:
# Создание модели из одного слоя (кодировщик)
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(ids_from_chars)
model.summary()

# Сохранение модели
path = str(path_models / filename_model_ids_from_chars)
model.save(path, save_format="tf")

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
string_lookup (StringLookup) (None, 1)                 0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


2022-08-11 20:47:17.552983: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


In [19]:
# Создание модели из одного слоя (кодировщик)
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.int32))
model.add(chars_from_ids)
model.summary()

# Сохранение модели
path = str(path_models / filename_model_chars_from_ids)
model.save(path, save_format="tf")

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
string_lookup_1 (StringLooku (None, 1)                 0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


### 2.4.3 Подготовка последовательностей

In [20]:
input_target.head()

Unnamed: 0,input_txt,target_txt,input_txt_split,target_txt_split
0,Can we make this quick? Roxanne Korrine and A...,Well I thought we'd start with pronunciation i...,"[C, a, n, , w, e, , m, a, k, e, , t, h, i, ...","[<START>, W, e, l, l, , I, , t, h, o, u, g, ..."
1,Well I thought we'd start with pronunciation i...,Not the hacking and gagging and spitting part....,"[W, e, l, l, , I, , t, h, o, u, g, h, t, , ...","[<START>, N, o, t, , t, h, e, , h, a, c, k, ..."
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,"[N, o, t, , t, h, e, , h, a, c, k, i, n, g, ...","[<START>, O, k, a, y, ., ., ., , t, h, e, n, ..."
3,You're asking me out. That's so cute. What's ...,Forget it.,"[Y, o, u, ', r, e, , a, s, k, i, n, g, , m, ...","[<START>, F, o, r, g, e, t, , i, t, ., <END>]"
4,No no it's my fault -- we didn't have a proper...,Cameron.,"[N, o, , n, o, , i, t, ', s, , m, y, , f, ...","[<START>, C, a, m, e, r, o, n, ., <END>]"


In [21]:
# Получаем вход для энкодера: Кодируем столбец (str) в int
%time input_target['encoder_input_seq']  = input_target['input_txt_split'].map(ids_from_chars)

# Получаем вход и выход для декодера: Кодируем столбец (str) в int
%time input_target['decoder_input']  = input_target['target_txt_split'].map(ids_from_chars)
# Разделяем на два набора, со смещением: в первом <START> (125); во вотором <END>(id=126)
%time input_target['decoder_input_seq']  = input_target['decoder_input'].map(lambda x: x[:-1])
%time input_target['decoder_target_seq'] = input_target['decoder_input'].map(lambda x: x[1:])

input_target.head()

CPU times: user 1min 31s, sys: 393 ms, total: 1min 31s
Wall time: 1min 31s
CPU times: user 1min 30s, sys: 430 ms, total: 1min 31s
Wall time: 1min 31s
CPU times: user 26.4 s, sys: 281 ms, total: 26.7 s
Wall time: 26.7 s
CPU times: user 25.8 s, sys: 250 ms, total: 26 s
Wall time: 26 s


Unnamed: 0,input_txt,target_txt,input_txt_split,target_txt_split,encoder_input_seq,decoder_input,decoder_input_seq,decoder_target_seq
0,Can we make this quick? Roxanne Korrine and A...,Well I thought we'd start with pronunciation i...,"[C, a, n, , w, e, , m, a, k, e, , t, h, i, ...","[<START>, W, e, l, l, , I, , t, h, o, u, g, ...","(tf.Tensor(34, shape=(), dtype=int64), tf.Tens...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(54, shape=(), dtype=int64), tf.Tens..."
1,Well I thought we'd start with pronunciation i...,Not the hacking and gagging and spitting part....,"[W, e, l, l, , I, , t, h, o, u, g, h, t, , ...","[<START>, N, o, t, , t, h, e, , h, a, c, k, ...","(tf.Tensor(54, shape=(), dtype=int64), tf.Tens...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(45, shape=(), dtype=int64), tf.Tens..."
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,"[N, o, t, , t, h, e, , h, a, c, k, i, n, g, ...","[<START>, O, k, a, y, ., ., ., , t, h, e, n, ...","(tf.Tensor(45, shape=(), dtype=int64), tf.Tens...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(46, shape=(), dtype=int64), tf.Tens..."
3,You're asking me out. That's so cute. What's ...,Forget it.,"[Y, o, u, ', r, e, , a, s, k, i, n, g, , m, ...","[<START>, F, o, r, g, e, t, , i, t, ., <END>]","(tf.Tensor(56, shape=(), dtype=int64), tf.Tens...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(37, shape=(), dtype=int64), tf.Tens..."
4,No no it's my fault -- we didn't have a proper...,Cameron.,"[N, o, , n, o, , i, t, ', s, , m, y, , f, ...","[<START>, C, a, m, e, r, o, n, ., <END>]","(tf.Tensor(45, shape=(), dtype=int64), tf.Tens...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(125, shape=(), dtype=int64), tf.Ten...","(tf.Tensor(34, shape=(), dtype=int64), tf.Tens..."


In [22]:
input_target['decoder_input_seq'].map(len).max()

200

In [23]:
# Добавляем паддинги, без maxlen padding input = 180, target = 188, ⚠️было 190, изменил на 60 maxlen=200, 
# tf.keras.utils.pad_sequences() # 2.9
encoder_input_seq = input_target['encoder_input_seq']
%time encoder_input_seq = tf.keras.preprocessing.sequence.pad_sequences(encoder_input_seq, padding="post")

decoder_input_seq = input_target['decoder_input_seq']
%time decoder_input_seq = tf.keras.preprocessing.sequence.pad_sequences(decoder_input_seq, padding="post")

decoder_target_seq = input_target['decoder_target_seq']
%time decoder_target_seq = tf.keras.preprocessing.sequence.pad_sequences(decoder_target_seq, padding="post")

CPU times: user 31.2 s, sys: 88.7 ms, total: 31.3 s
Wall time: 31.3 s
CPU times: user 29.8 s, sys: 43.9 ms, total: 29.9 s
Wall time: 29.9 s
CPU times: user 31.1 s, sys: 103 ms, total: 31.2 s
Wall time: 31.2 s


In [24]:
dataset = tf.data.Dataset.from_tensor_slices((encoder_input_seq,  decoder_input_seq, decoder_target_seq))

for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(200,), dtype=int32, numpy=
array([34, 63, 76,  1, 85, 67,  1, 75, 63, 73, 67,  1, 82, 70, 71, 81,  1,
       79, 83, 71, 65, 73, 31,  1,  1, 49, 77, 86, 63, 76, 76, 67,  1, 42,
       77, 80, 80, 71, 76, 67,  1, 63, 76, 66,  1, 32, 76, 66, 80, 67, 85,
        1, 33, 63, 80, 80, 67, 82, 82,  1, 63, 80, 67,  1, 70, 63, 84, 71,
       76, 69,  1, 63, 76,  1, 71, 76, 65, 80, 67, 66, 71, 64, 74, 87,  1,
       70, 77, 80, 80, 67, 76, 66, 77, 83, 81,  1, 78, 83, 64, 74, 71, 65,
        1, 64, 80, 67, 63, 73, 13,  1, 83, 78,  1, 77, 76,  1, 82, 70, 67,
        1, 79, 83, 63, 66, 14,  1,  1, 32, 69, 63, 71, 76, 14,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)>, <tf.Tensor: shape=(200,), dtype=int32, numpy=
array([

## 2.5 Сохраняем Dataset -> TFRecord

In [25]:
def bytes_feature(seq):
  if isinstance(seq, type(tf.constant(0))):
    seq = seq.numpy()
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[seq]))

def create_record(encode_input_seqs, decode_input_seqs, decode_target_seqs):
    feature = {
        'encode_input_seqs'  : encode_input_seqs,
        'decode_input_seqs'  : decode_input_seqs,
        'decode_target_seqs' : decode_target_seqs,
      }
    proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return proto.SerializeToString()

In [26]:
%%time
# Сохраняем на диск
path = str(path_data / filename_input_target_ids)

with tf.io.TFRecordWriter(path) as writer:
    num = 0
    for sample in dataset:
        if num % 20_000 == 0:
            print(num)
        num += 1
        # ⚠️ Сериализация, нельзя записать список чисел 
        enI = tf.io.serialize_tensor(sample[0])
        enI = bytes_feature(enI)

        deI = tf.io.serialize_tensor(sample[1])
        deI = bytes_feature(deI)

        deT = tf.io.serialize_tensor(sample[2])
        deT = bytes_feature(deT)

        pr = create_record(enI, deI, deT)
        writer.write(pr)

    print(f'dataset сохранён: {path}')

0
20000
40000
60000
80000
100000
120000
140000
160000
180000
200000
dataset сохранён: data/input_target_ids_len200.tfrecords
CPU times: user 1min 15s, sys: 2.46 s, total: 1min 17s
Wall time: 1min 17s


In [27]:
# Функция для чтения TFRecords
feature_description = {
        'encode_input_seqs'  : tf.io.FixedLenFeature([], tf.string, default_value=''),
        'decode_input_seqs'  : tf.io.FixedLenFeature([], tf.string, default_value=''),
        'decode_target_seqs' : tf.io.FixedLenFeature([], tf.string, default_value=''),
          }

def parse_function(example_proto):
  parsed = tf.io.parse_single_example(example_proto, feature_description)
  return (parsed['encode_input_seqs'], parsed['decode_input_seqs'], parsed['decode_target_seqs'])

In [28]:
ls -all ./data

total 517424
drwxr-xr-x 2 root root      4096 Aug 11 20:52 [0m[01;34m.[0m/
drwxr-xr-x 7 root root      4096 Aug 11 20:47 [01;34m..[0m/
-rw-r--r-- 1 root root 529830042 Aug 11 20:54 input_target_ids_len200.tfrecords


In [29]:
# Проверка записанных данных
path = str(path_data / filename_input_target_ids)

dataset = tf.data.TFRecordDataset(filenames = [path])
dataset = dataset.map(parse_function)
dataset = dataset.map(lambda x, y, z: [tf.io.parse_tensor(x, out_type=tf.int32), 
                                       tf.io.parse_tensor(y, out_type=tf.int32),
                                       tf.io.parse_tensor(z, out_type=tf.int32)])

len_tfr = []
for i in dataset:
    len_tfr.append(i)
print('Набор объёмом:', len(len_tfr))

for i in dataset.take(1):
    print(i)

2022-08-11 20:54:08.594003: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Набор объёмом: 207858
(<tf.Tensor: shape=(200,), dtype=int32, numpy=
array([34, 63, 76,  1, 85, 67,  1, 75, 63, 73, 67,  1, 82, 70, 71, 81,  1,
       79, 83, 71, 65, 73, 31,  1,  1, 49, 77, 86, 63, 76, 76, 67,  1, 42,
       77, 80, 80, 71, 76, 67,  1, 63, 76, 66,  1, 32, 76, 66, 80, 67, 85,
        1, 33, 63, 80, 80, 67, 82, 82,  1, 63, 80, 67,  1, 70, 63, 84, 71,
       76, 69,  1, 63, 76,  1, 71, 76, 65, 80, 67, 66, 71, 64, 74, 87,  1,
       70, 77, 80, 80, 67, 76, 66, 77, 83, 81,  1, 78, 83, 64, 74, 71, 65,
        1, 64, 80, 67, 63, 73, 13,  1, 83, 78,  1, 77, 76,  1, 82, 70, 67,
        1, 79, 83, 63, 66, 14,  1,  1, 32, 69, 63, 71, 76, 14,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)>, <tf.Tensor: shape=(200,), dtype

## 3 Сохранение результатов

In [30]:
import shutil

shutil.make_archive(base_name='./data', format='zip', base_dir='./data')
shutil.make_archive(base_name='./models', format='zip', base_dir='./models')

'./models.zip'

In [31]:
!ls -all

total 17008
drwxr-xr-x 7 root root     4096 Aug 11 20:54 .
drwxr-xr-x 6 root root     4096 Aug 11 20:46 ..
---------- 1 root root    85954 Aug 11 20:54 __notebook__.ipynb
drwxr-xr-x 2 root root     4096 Aug 11 20:52 data
-rw-r--r-- 1 root root 17282536 Aug 11 20:54 data.zip
drwxr-xr-x 2 root root     4096 Aug 11 20:47 img
drwxr-xr-x 2 root root     4096 Aug 11 20:47 mlruns
drwxr-xr-x 6 root root     4096 Aug 11 20:47 models
-rw-r--r-- 1 root root    14837 Aug 11 20:54 models.zip
drwxr-xr-x 2 root root     4096 Aug 11 20:47 tblogs
