In [1]:
# Install a blanket "ignore" filter for Python warnings before anything else is
# imported. NOTE(review): presumably placed first so that import-time warnings
# from the libraries below are hidden too — confirm this is intended, since it
# also hides warnings raised later in the notebook.
import warnings
warnings.filterwarnings("ignore")

import json
from os.path import join

# Project imports: the experiment runner, the operations and the models that
# the pipeline structures below are built from.
from deepburtsev.core.pipelinemanager import PipelineManager
from deepburtsev.core.transformers import FasttextVectorizer, ResultsCollector
from deepburtsev.models.intent_classification.DCNN import DCNN
from deepburtsev.models.skmodels.linear_models import LinearRegression, LinearSVM, RandomForest
from deepburtsev.core.sktransformers import Tfidf
from deepburtsev.core.sktransformers import Count

Using TensorFlow backend.


### Задаём корневую директорию репозитория и путь к датасету

In [2]:
# Repository root and the dataset location derived from it.
# NOTE(review): the root is an absolute, machine-specific path — change it to
# point at your own checkout before running the notebook.
root = '/home/mks/projects/DeepBurtsev/'
file_path = join(root, 'data', 'english', 'new_group', 'dataset.json')

# Load the whole dataset into memory. The `with` block closes the file on exit,
# so no explicit close() call is needed. The encoding is given explicitly so
# that reading the JSON does not depend on the locale's default encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

### Скачиваем эмбеддинг по ссылке

In [3]:
# Optional step, disabled by default: uncomment the four lines below to fetch
# an embedding file into <root>/downloads/embeddings.
# NOTE(review): the file name in this URL says "300_ru", while the
# FasttextVectorizer in this notebook is configured with dimension=100 and a
# reddit-comments model path — confirm which embedding is actually intended.
# from deepburtsev.core.utils import download_untar
# url = 'http://lnsigo.mipt.ru/export/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.bin'
# download_path = join(root, 'downloads', 'embeddings')
# download_untar(url, download_path)

### Задаём недефолтные значения параметров операций, если нужно

In [4]:
# All three operations are configured with the same split names for both
# `request_names` and `new_names` (presumably: which dataset parts to read and
# under which names to store the result — confirm against the transformer API).
# Each argument receives its own fresh list, exactly as in the literal form.
split_names = ('train', 'valid', 'test')

# fastText vectorizer set up for a 100-dimensional embedding file.
fasttext = FasttextVectorizer(
    request_names=list(split_names),
    new_names=list(split_names),
    dimension=100,
    model_path='./embeddings/wordpunct_tok_reddit_comments_2017_11_100.bin',
)

# Tfidf and Count operations, sharing the split configuration above.
tfidf = Tfidf(request_names=list(split_names), new_names=list(split_names))
count = Count(request_names=list(split_names), new_names=list(split_names))

### Задаём структуру эксперимента при помощи списка операций

In [5]:
# Pipeline structures that are handed to the pipeline manager.

# Settings passed to DCNN alongside the class itself.
# NOTE(review): with 'search' enabled the nine 'epochs' values appear to be
# tried one pipeline at a time — confirm against PipelineManager.
dcnn_settings = {
    'search': True,
    'batch_size': 32,
    'epochs': [3, 5, 8, 10, 12, 14, 16, 18, 20],
}

# Neural experiment: vectorizer -> DCNN -> results collection.
neural_struct = [
    fasttext,
    (DCNN, dcnn_settings),
    ResultsCollector,
]

# Linear experiment. Presumably each inner list holds alternatives that the
# manager combines (2 vectorizers x 3 models) — verify against PipelineManager.
linear_struct = [
    [tfidf, count],
    [LinearRegression, LinearSVM, RandomForest],
    ResultsCollector,
]

In [6]:
# Run the search over all pipelines; the hyper_search parameter switches
# hyperparameter tuning on and off.
# The neural model uses 20 epochs by default. With hyperparameter tuning
# enabled, everything takes a very long time to compute.
# NOTE(review): no `hyper_search` argument is passed below; the search flag
# visible in this notebook is 'search': True in the DCNN settings — confirm
# which switch the comment above refers to.

# Both experiments share the same experiment name and target metric.
experiment_name = 'skill_manager'
manager_options = {'target_metric': 'f1_macro'}

# Neural experiment first, ...
neural_man = PipelineManager(dataset, neural_struct, experiment_name, **manager_options)
neural_man.run()

# ... then the linear one, in the same order as before.
linear_man = PipelineManager(dataset, linear_struct, experiment_name, **manager_options)
linear_man.run()

100%|██████████| 599/599 [00:00<00:00, 5088.01it/s]
100%|██████████| 73/73 [00:00<00:00, 6261.44it/s]
100%|██████████| 73/73 [00:00<00:00, 7323.93it/s]

[ Experiment start ... ]
[ Starting vectorization ... ]
[ Vectorization of train part of dataset ... ]
[ Vectorization of valid part of dataset ... ]
[ Vectorization of test part of dataset ... ]
[ Vectorization was ended. ]

____Training over 599 samples____







Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
[ Initializing intent_model from scratch ]
Instructions for updating:
keep_dims is deprecated, use keepdims instead
train -->	updates: 1	fmeasure: 0.9409974813461304	 
epochs_done: 1
train -->	updates: 20	fmeasure: 4.428089141845703	 
epochs_done: 2
train -->	updates: 39	fmeasure: 3.6824333667755127	 
epochs_done: 3
train -->	updates: 58	fmeasure: 3.029947519302368	 
epochs_done: 4
train -->	updates: 77	fmeasure: 2.523181438446045	 
epochs_done: 5
train -->	updates: 96	fmeasure: 2.1780221462249756	 
epochs_done: 6
train -->	updates: 115	fmeasure: 1.937276005744934	 
epochs_done: 7
train -->	updates: 134	fmeasure: 1.8077319860458374	 
epochs_done: 8
train -->	updates: 153	fmeasure: 1.5487821102142334	 
epochs_done: 9
train -->	updates: 172	fmeasure: 1.3908882141

100%|██████████| 599/599 [00:00<00:00, 8061.16it/s]
100%|██████████| 73/73 [00:00<00:00, 6418.82it/s]
100%|██████████| 73/73 [00:00<00:00, 6831.42it/s]







[ Progress: pipe 2/9; Time left: 0:2:4; ]
[ Starting vectorization ... ]
[ Vectorization of train part of dataset ... ]
[ Vectorization of valid part of dataset ... ]
[ Vectorization of test part of dataset ... ]
[ Vectorization was ended. ]

____Training over 599 samples____







[ Initializing intent_model from scratch ]
train -->	updates: 1	fmeasure: 0.756723165512085	 
epochs_done: 1
train -->	updates: 20	fmeasure: 4.770534992218018	 
epochs_done: 2
train -->	updates: 39	fmeasure: 4.080240249633789	 
epochs_done: 3
train -->	updates: 58	fmeasure: 3.424823760986328	 
epochs_done: 4
train -->	updates: 77	fmeasure: 2.9522292613983154	 
epochs_done: 5
train -->	updates: 96	fmeasure: 2.514756202697754	 
epochs_done: 6
train -->	updates: 115	fmeasure: 2.272418737411499	 
epochs_done: 7
train -->	updates: 134	fmeasure: 2.036010503768921	 
epochs_done: 8
train -->	updates: 153	fmeasure: 1.8511836528778076	 
epochs_done: 9
train -->	updates: 172	fmeasure: 1.7262858152389526	 
epochs_done: 10
train -->	updates: 191	fmeasure: 1.5841009616851807	 
epochs_done: 11
train -->	updates: 210	fmeasure: 1.476325273513794	 
epochs_done: 12
train -->	updates: 229	fmeasure: 1.4038852453231812	 
epochs_done: 13
train -->	updates: 248	fmeasure: 1.2516309022903442	 
epochs_done: 14
t

100%|██████████| 599/599 [00:00<00:00, 8276.00it/s]
100%|██████████| 73/73 [00:00<00:00, 8072.77it/s]
100%|██████████| 73/73 [00:00<00:00, 7438.70it/s]







[ Progress: pipe 3/9; Time left: 0:1:49; ]
[ Starting vectorization ... ]
[ Vectorization of train part of dataset ... ]
[ Vectorization of valid part of dataset ... ]
[ Vectorization of test part of dataset ... ]
[ Vectorization was ended. ]

____Training over 599 samples____







[ Initializing intent_model from scratch ]
Operation with number 2;


KeyboardInterrupt: 

### Вывод результатов