In [1]:
import json
import re
import pandas as pd
import nltk
import pymorphy2
import fasttext
import numpy as np
from tqdm import tqdm

from utils import labels2onehot_one
from deeppavlov.core.commands.infer import build_model_from_config
from dataset import Dataset
from transformer import Speller, Tokenizer, Lemmatizer, FasttextVectorizer
morph = pymorphy2.MorphAnalyzer()

# classifiers (gradient boosting, linear models, tree ensembles)
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# sklearn feature extractors
from sklearn.feature_extraction.text import CountVectorizer as count
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.preprocessing import LabelEncoder

import random
from typing import Generator
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)
2018-03-22 22:31:18.863 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 16: Loading dictionaries from /home/mks/envs/intent_script/lib/python3.6/site-packages/pymorphy2_dicts/data
2018-03-22 22:31:18.910 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 20: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2018-03-22 22:31:19.4 DEBUG in 'matplotlib.backends'['__init__'] at line 90: backend module://ipykernel.pylab.backend_inline version unknown


## Dataset

In [2]:
def read_dataset(filepath, duplicates=False, clean=True):
    """Load the complaints CSV and return its request/category columns.

    The file is read as ISO-8859-1 and every header and string cell is then
    re-interpreted as cp1251, which recovers the Cyrillic text.

    Args:
        filepath: path to the CSV file.
        duplicates: when False (default), fully duplicated rows are dropped.
        clean: when True (default), requests are lower-cased and every
            character outside ``а-я``, ``a-z``, ``ё`` and ``0-9`` is replaced
            by a space, with runs of whitespace collapsed.

    Returns:
        pandas.DataFrame with the columns 'request' and 'report', plus an
        'index' column holding the row labels from before the final
        ``reset_index``.
    """
    def recode(text):
        # ISO-8859-1 maps bytes to characters one-to-one, so encoding back
        # yields the original bytes, which are really cp1251 text.
        return text.encode('ISO-8859-1').decode('cp1251')

    # The context manager closes the handle even when parsing fails.
    with open(filepath, 'r', encoding='ISO-8859-1') as file:
        data = pd.read_csv(file)

    new_data = dict()
    for old in data.keys():
        # Non-string cells (NaN, numbers) are kept untouched.
        new_data[recode(old)] = [recode(c) if isinstance(c, str) else c for c in data[old]]

    new_data = pd.DataFrame(new_data, columns=['Описание', 'Категория жалобы'])
    new_data = new_data.rename(columns={'Описание': 'request', 'Категория жалобы': 'report'})
    new_data = new_data.dropna()  # drop rows with missing values
    if not duplicates:
        new_data = new_data.drop_duplicates()  # drop duplicated rows

    # The cleaning step could also be used as a separate pipeline branch.
    if clean:
        def delete_bad_symbols(x):
            return " ".join(re.sub('[^а-яa-zё0-9]', ' ', x.lower()).split())

        new_data = new_data.assign(request=new_data['request'].apply(delete_bad_symbols))

    new_data = new_data.reset_index()

    return new_data

In [3]:
class Dataset(object):
    """In-memory container for the named parts of a text-classification dataset.

    Parts live in ``self.data`` under keys such as 'base', 'train', 'valid'
    and 'test'. Each part is indexed by the two names in ``self.main_names``:
    'request' (the text) and 'report' (the label).
    """

    def __init__(self, data, seed=None, classes_description=None, *args, **kwargs):
        """Store the given data and compute class statistics.

        Args:
            data: either a single table, stored under 'base', or a mapping
                with any of the entries 'train', 'test' and 'valid', each of
                which is stored under its own key.
            seed: seed for the private random state used by ``split`` and
                ``iter_batch``.
            classes_description: free-form description returned by ``info``.
            *args, **kwargs: accepted and ignored.
        """
        self.main_names = ['request', 'report']

        # Derive a private RNG state from the seed without disturbing the
        # global generator.
        rs = random.getstate()
        random.seed(seed)
        self.random_state = random.getstate()
        random.setstate(rs)

        self.classes_description = classes_description
        self.data = dict()

        # Keep every part that was supplied, not only the first one found.
        for part in ('train', 'test', 'valid'):
            frame = data.get(part)
            if frame is not None:
                self.data[part] = frame
        if not self.data:
            self.data['base'] = data

        self.classes = self.get_classes()
        self.classes_distribution = self.get_distribution()

    def _reference_part(self):
        """Return the part used for class statistics: 'base', else 'train'."""
        base = self.data.get('base')
        return base if base is not None else self.data['train']

    def _to_frame(self, pairs):
        """Build a two-column DataFrame from a list of (request, report) pairs."""
        return pd.DataFrame({self.main_names[0]: [x for x, _ in pairs],
                             self.main_names[1]: [y for _, y in pairs]})

    @staticmethod
    def _take(column, positions):
        """Select rows by position; falls back to plain indexing for non-pandas columns."""
        if hasattr(column, 'iloc'):
            return column.iloc[positions]
        return column[positions]

    def simple_split(self, splitting_proportions, field_to_split, splitted_fields, delete_parent=True):
        """Split one part into several with sklearn's ``train_test_split``.

        Args:
            splitting_proportions: share of the original part for each new
                part except the last, which receives the remainder.
            field_to_split: key of the part to split.
            splitted_fields: keys of the new parts.
            delete_parent: remove ``field_to_split`` afterwards.

        Returns:
            self
        """
        data_to_div = self.data[field_to_split].copy()
        data_size = len(data_to_div)
        for i in range(len(splitted_fields) - 1):
            # Everything beyond this part's share stays in the pool for the next parts.
            rest_size = len(data_to_div) - int(data_size * splitting_proportions[i])
            self.data[splitted_fields[i]], data_to_div = train_test_split(data_to_div, test_size=rest_size)
        self.data[splitted_fields[-1]] = data_to_div

        if delete_parent:
            self.data.pop(field_to_split)

        return self

    def split(self, splitting_proportions=None, delete_parent=True):
        """Split 'base' into 'train', 'valid' and 'test', class by class.

        For every class, ``int(count * proportion)`` samples go to 'valid' and
        to 'test', where ``count`` comes from ``self.classes_distribution``;
        the rest go to 'train'. Sampling uses the seeded private random state,
        so equal seeds give equal splits.

        Args:
            splitting_proportions: ``[valid_share, test_share]`` as floats;
                defaults to ``[0.1, 0.1]``.
            delete_parent: remove 'base' afterwards.

        Returns:
            self

        Raises:
            ValueError: if there is no 'base' part, the proportions are not a
                list, or a class has too few samples for the requested shares.
        """
        if splitting_proportions is None:
            splitting_proportions = [0.1, 0.1]

        dataset = self.data.get('base')
        if dataset is None:
            raise ValueError("You dataset don't contains 'base' key. If You want to split a specific part dataset,"
                             "please use .simple_split method.")

        if type(splitting_proportions) is not list:
            raise ValueError('Split proportion must be list of floats, with length = 2')
        assert len(splitting_proportions) == 2
        assert type(splitting_proportions[0]) is float
        valid_share, test_share = splitting_proportions

        # Group the (request, report) pairs by class.
        per_class = dict()
        for x, y in zip(dataset[self.main_names[0]], dataset[self.main_names[1]]):
            per_class.setdefault(y, []).append((x, y))

        cd = self.classes_distribution
        train, valid, test = list(), list(), list()

        rs = random.getstate()
        random.setstate(self.random_state)
        try:
            for label, samples in per_class.items():
                n_valid = int(cd[label] * valid_share)
                n_test = int(cd[label] * test_share)
                if n_valid < 0 or n_test < 0 or n_valid + n_test > len(samples):
                    raise ValueError('Sample larger than population or is negative')
                # Shuffle once and slice instead of sampling and removing one by one.
                random.shuffle(samples)
                valid.extend(samples[:n_valid])
                test.extend(samples[n_valid:n_valid + n_test])
                train.extend(samples[n_valid + n_test:])

            # Mix the classes inside every part.
            for part in (train, valid, test):
                random.shuffle(part)
        finally:
            self.random_state = random.getstate()
            random.setstate(rs)

        self.data['train'] = self._to_frame(train)
        self.data['valid'] = self._to_frame(valid)
        self.data['test'] = self._to_frame(test)

        if delete_parent:
            self.data.pop('base', None)

        return self

    def iter_batch(self, batch_size: int, data_type: str = 'base') -> Generator:
        """Yield raw (not preprocessed) batches in a freshly shuffled order.

        Args:
            batch_size (int): number of samples in a batch
            data_type (str): key of the part to iterate over
        Returns:
            batch_gen (Generator): yields ``[requests, reports]``, two lists of
                equal length, covering the whole part once
        """
        data = self.data[data_type]
        data_len = len(data)
        order = list(range(data_len))

        rs = random.getstate()
        random.setstate(self.random_state)
        random.shuffle(order)
        self.random_state = random.getstate()
        random.setstate(rs)

        requests = data[self.main_names[0]]
        reports = data[self.main_names[1]]
        for i in range((data_len - 1) // batch_size + 1):
            o = order[i * batch_size:(i + 1) * batch_size]
            # ``order`` holds positions, so rows are selected by position, not by index label.
            yield [list(self._take(requests, o)), list(self._take(reports, o))]

    def iter_all(self, data_type: str = 'base') -> Generator:
        """Iterate over every sample of one part, in stored order.

        Args:
            data_type (str): key of the part to iterate over
        Returns:
            samples_gen: a generator of ``(request, report)`` tuples
        """
        data = self.data[data_type]
        for x, y in zip(data[self.main_names[0]], data[self.main_names[1]]):
            yield (x, y)

    def merge_data(self, fields_to_merge, delete_parent=True, new_name=None):
        """Concatenate several parts into one.

        Args:
            fields_to_merge: keys of the parts to concatenate.
            delete_parent: remove the merged parts afterwards.
            new_name: key of the result; defaults to the parent keys joined by '_'.

        Returns:
            self

        Raises:
            KeyError: if any requested part is missing.
        """
        if new_name is None:
            new_name = '_'.join(fields_to_merge)

        if not set(fields_to_merge) <= set(self.data.keys()):
            raise KeyError('In dataset no such parts {}'.format(fields_to_merge))

        self.data[new_name] = pd.concat([self.data[s] for s in fields_to_merge])

        if delete_parent:
            for x in fields_to_merge:
                # Never drop the part that was just created under the same key.
                if x != new_name:
                    self.data.pop(x)

        return self

    def del_data(self, fields_to_del):
        """Remove the given parts; raises KeyError for a missing key. Returns self."""
        for name in fields_to_del:
            self.data.pop(name)
        return self

    def get_classes(self):
        """Return the unique labels of 'base', or of 'train' when there is no 'base'."""
        return self._reference_part()[self.main_names[1]].unique()

    def get_distribution(self):
        """Return, per label, the number of unique requests in 'base' (or 'train')."""
        return self._reference_part().groupby(self.main_names[1])[self.main_names[0]].nunique()

    def info(self):
        """Return the stored part keys and the classes description as a dict."""
        information = dict(data_keys=list(self.data.keys()),
                           classes_description=self.classes_description)

        return information

In [4]:
# NOTE(review): absolute, machine-specific path; the notebook will not run elsewhere without editing it.
path = '/home/mks/projects/intent_classification_script/data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
# The whole table is stored under the 'base' key of the Dataset.
dataset = Dataset(global_data, seed=42)
# With no arguments split() uses the [0.1, 0.1] valid/test proportions and returns the same object.
dataset = dataset.split()

  if self.run_code(code, result):


## Transformers

In [None]:
class BaseTransformer(object):
    """Base class for dataset transformers driven by a config dict.

    Subclasses implement ``_transform``; ``transform`` first resolves which
    dataset parts to read (``request_names``) and where to write the results
    (``new_names``).
    """

    def __init__(self, config=None):
        """Validate and store the config.

        Args:
            config: dict that must contain the keys 'op_type', 'name',
                'request_names', 'new_names', 'input_x_type', 'input_y_type',
                'output_x_type' and 'output_y_type'.

        Raises:
            ValueError: if ``config`` is not a dict or lacks a required key.
        """
        if not isinstance(config, dict):
            raise ValueError('Input config must be dict, but {} was found.'.format(type(config)))

        keys = ['op_type', 'name', 'request_names', 'new_names', 'input_x_type', 'input_y_type', 'output_x_type',
                'output_y_type']
        self.info = dict()
        for x in keys:
            if x not in config.keys():
                raise ValueError('Input config must contain {} key.'.format(x))
            self.info[x] = config[x]

        self.config = config

        # Name spaces: worked_names is what the config asked for, request_names
        # is what _validate_names resolved against a concrete dataset.
        self.new_names = config['new_names']
        self.worked_names = config['request_names']
        self.request_names = []

    def _validate_names(self, dataset):
        """Resolve ``request_names`` (and default ``new_names``) for ``dataset``.

        The result is rebuilt from scratch on every call, so calling
        ``transform`` repeatedly neither accumulates duplicates nor changes
        ``worked_names``.

        Raises:
            ValueError: if the configured request names are not a list.
            KeyError: if a configured name is missing from the dataset, or if
                none of the default parts is present.
        """
        available = dataset.data.keys()

        if self.worked_names is not None:
            if not isinstance(self.worked_names, list):
                raise ValueError('Request_names must be a list, but {} was found.'.format(type(self.worked_names)))

            for name in self.worked_names:
                if name not in available:
                    raise KeyError('Key {} not found in dataset.'.format(name))
            resolved = list(self.worked_names)
        else:
            # No explicit request: take whichever standard parts exist.
            default_names = ['base', 'train', 'valid', 'test']
            resolved = [name for name in default_names if name in available]
            if len(resolved) == 0:
                raise KeyError('Keys from {} not found in dataset.'.format(default_names))

        self.request_names = resolved

        if self.new_names is None:
            # Copy so later changes to one list cannot leak into the other.
            self.new_names = list(resolved)

        return self

    def _transform(self, dataset):
        """Hook for subclasses; the base implementation returns None."""
        return None

    def transform(self, dataset):
        """Validate the names against ``dataset`` and run ``_transform``."""
        self._validate_names(dataset)
        return self._transform(dataset)

    def get_params(self):
        """Return the config dict."""
        return self.config

    def set_params(self, params):
        """Re-initialise the transformer with ``params`` and return self."""
        self.__init__(params)
        return self


class Speller(BaseTransformer):
    """Transformer that runs a DeepPavlov spelling-correction model over requests."""

    def __init__(self, config=None):
        """Build the speller described by the JSON file at ``config['path']``.

        Args:
            config: transformer config that must also contain 'path'; when
                None, a default config working on the 'base' part is used.

        Raises:
            ValueError: if ``config`` lacks 'path' or a key required by
                ``BaseTransformer``.
        """
        if config is None:
            self.config = {'op_type': 'transformer',
                           'name': 'Speller',
                           'request_names': ['base'],
                           'new_names': ['base'],
                           'input_x_type': pd.core.series.Series,
                           'input_y_type': pd.core.series.Series,
                           'output_x_type': pd.core.series.Series,
                           'output_y_type': pd.core.series.Series,
                           'path': '/home/mks/projects/intent_classification_script/DeepPavlov/deeppavlov/configs/error_model/brillmoore_kartaslov_ru.json'}
        else:
            need_names = ['path']
            for name in need_names:
                if name not in config.keys():
                    raise ValueError('Input config must contain {}.'.format(name))

            self.config = config

        super().__init__(self.config)

        self.conf_path = self.config['path']
        with open(self.conf_path) as config_file:
            # Kept apart from self.config so that get_params()/set_params()
            # keep working with the transformer config.
            self.speller_config = json.load(config_file)

        self.speller = build_model_from_config(self.speller_config)

    def _transform(self, dataset):
        """Replace every request of the selected parts by the speller output.

        Returns:
            the same ``dataset`` with results stored under ``new_names``.
        """
        print('[ Speller start working ... ]')

        request, report = dataset.main_names
        for name, new_name in zip(self.request_names, self.new_names):
            data = dataset.data[name]
            # The model is called on one-element batches, one request at a time.
            refactor = [self.speller([x])[0] for x in tqdm(data[request])]

            dataset.data[new_name] = pd.DataFrame({request: refactor,
                                                   report: data[report]})

        print('[ Speller done. ]')
        return dataset


class Tokenizer(BaseTransformer):
    """Transformer that turns every request string into a flat list of NLTK tokens."""

    def __init__(self, config=None):
        """Use ``config`` when given, otherwise a default config working on 'base'."""
        default_config = {'op_type': 'transformer',
                          'name': 'Tokenizer',
                          'request_names': ['base'],
                          'new_names': ['base'],
                          'input_x_type': pd.core.series.Series,
                          'input_y_type': pd.core.series.Series,
                          'output_x_type': pd.core.series.Series,
                          'output_y_type': pd.core.series.Series}
        self.config = default_config if config is None else config

        super().__init__(self.config)

    def _transform(self, dataset):
        """Tokenize the requests of the selected parts and return the dataset."""
        print('[ Starting tokenization ... ]')

        request, report = dataset.main_names
        for source, target in zip(self.request_names, self.new_names):
            tokenized = list()

            for text in tqdm(dataset.data[source][request]):
                # Split into sentences first, then join the word tokens of all sentences.
                sentences = nltk.sent_tokenize(text)
                per_sentence = [nltk.word_tokenize(sentence) for sentence in sentences]
                flat = list()
                for words in per_sentence:
                    flat.extend(words)
                tokenized.append(flat)

            dataset.data[target] = pd.DataFrame({request: tokenized,
                                                 report: dataset.data[source][report]})

        print('[ Tokenization was done. ]')
        return dataset


class Lemmatizer(BaseTransformer):
    """Transformer that replaces every token by the normal form of its first pymorphy2 parse."""

    def __init__(self, config=None):
        """Create the morphological analyzer and store the config (default: work on 'base')."""
        self.morph = pymorphy2.MorphAnalyzer()

        default_config = {'op_type': 'transformer',
                          'name': 'Lemmatizer',
                          'request_names': ['base'],
                          'new_names': ['base'],
                          'input_x_type': pd.core.series.Series,
                          'input_y_type': pd.core.series.Series,
                          'output_x_type': pd.core.series.Series,
                          'output_y_type': pd.core.series.Series}
        self.config = default_config if config is None else config

        super().__init__(self.config)

    def _transform(self, dataset):
        """Lemmatize the requests of the selected parts and return the dataset."""
        print('[ Starting lemmatization ... ]')
        request, report = dataset.main_names
        for source, target in zip(self.request_names, self.new_names):
            lemmatized = list()

            for tokens in tqdm(dataset.data[source][request]):
                lemmas = list()
                for token in tokens:
                    # Only the first parse returned by the analyzer is used.
                    lemmas.append(self.morph.parse(token)[0].normal_form)
                lemmatized.append(lemmas)

            dataset.data[target] = pd.DataFrame({request: lemmatized,
                                                 report: dataset.data[source][report]})
        print('[ Ended lemmatization. ]')
        return dataset


class FasttextVectorizer(BaseTransformer):
    """Transformer that maps tokens to fastText vectors and labels to one-hot rows."""

    def __init__(self, config=None):
        """Load the fastText model named by ``config['path_to_model']``.

        Args:
            config: transformer config that must also contain 'path_to_model',
                'dimension' and 'file_type'; when None, defaults reading
                'train'/'valid'/'test' and writing '*_vec' parts are used.

        Raises:
            ValueError: if one of the extra keys is missing.
        """
        if config is not None:
            for required in ['path_to_model', 'dimension', 'file_type']:
                if required not in config.keys():
                    raise ValueError('Input config must contain {}.'.format(required))

            self.config = config
        else:
            self.config = {'op_type': 'vectorizer',
                           'name': 'fasttext',
                           'request_names': ['train', 'valid', 'test'],
                           'new_names': ['train_vec', 'valid_vec', 'test_vec'],
                           'input_x_type': pd.core.series.Series,
                           'input_y_type': pd.core.series.Series,
                           'output_x_type': pd.core.series.Series,
                           'output_y_type': pd.core.series.Series,
                           'path_to_model': '/home/mks/projects/intent_classification_script/data/russian/embeddings/ft_0.8.3_nltk_yalen_sg_300.bin',
                           'dimension': 300,
                           'file_type': 'bin'}

        super().__init__(self.config)

        self.vectorizer = fasttext.load_model(self.config['path_to_model'])

    def _transform(self, dataset):
        """Vectorize the selected parts and return the dataset.

        Each request becomes a ``(number of tokens, dimension)`` matrix; the
        labels are converted with ``labels2onehot_one`` against
        ``dataset.classes``.
        """
        print('[ Starting vectorization ... ]')
        request, report = dataset.main_names

        for source, target in zip(self.request_names, self.new_names):
            print('[ Vectorization of {} part of dataset ... ]'.format(source))
            matrices = []

            for tokens in tqdm(dataset.data[source][request]):
                # One row per token; rows are filled from the fastText lookup.
                matrix = np.zeros((len(tokens), self.config['dimension']))
                for row, token in enumerate(tokens):
                    matrix[row] = self.vectorizer[token]
                matrices.append(matrix)

            onehot = list(labels2onehot_one(dataset.data[source][report], dataset.classes))

            dataset.data[target] = pd.DataFrame({request: matrices,
                                                 report: onehot})

        print('[ Vectorization was ended. ]')
        return dataset


class TextConcat(BaseTransformer):
    """Transformer that joins the tokens of every request back into one space-separated string."""

    def __init__(self, config=None):
        """Use ``config`` when given, otherwise a default config working on 'base'."""
        if config is not None:
            # No extra keys are required beyond those checked by BaseTransformer.
            for required in []:
                if required not in config.keys():
                    raise ValueError('Input config must contain {}.'.format(required))

            self.config = config
        else:
            self.config = {'op_type': 'transformer',
                           'name': 'text_concatenator',
                           'request_names': ['base'],
                           'new_names': ['base'],
                           'input_x_type': pd.core.series.Series,
                           'input_y_type': pd.core.series.Series,
                           'output_x_type': pd.core.series.Series,
                           'output_y_type': pd.core.series.Series}

        super().__init__(self.config)

    def _transform(self, dataset):
        """Concatenate the tokens of the selected parts and return the dataset."""
        print('[ Starting text merging ... ]')
        request, report = dataset.main_names

        for source, target in zip(self.request_names, self.new_names):
            joined = []

            for tokens in tqdm(dataset.data[source][request]):
                joined.append(' '.join(list(tokens)))

            dataset.data[target] = pd.DataFrame({request: joined,
                                                 report: dataset.data[source][report]})

        print('[ Text concatenation was ended. ]')
        return dataset

In [None]:
# Smoke test: a config carrying exactly the keys BaseTransformer.__init__ requires.
a = BaseTransformer({'op_type': 'transformer', 'name': 'estomator',
                     'request_names': ['base'], 'new_names': ['base'],
                     'input_x_type': pd.core.series.Series,
                     'input_y_type': pd.core.series.Series,
                     'output_x_type': pd.core.series.Series,
                     'output_y_type': pd.core.series.Series})

In [None]:
# Raises KeyError when a requested part ('base' here) is not among dataset.data's keys.
a._validate_names(dataset)

In [None]:
# Show the output part names held by the transformer.
a.new_names

In [None]:
# split() consumes the 'base' part, so it is only run while that part still exists.
if 'base' in dataset.data:
    dataset.split()
req = ['test']

# The transformer constructors take a single config dict, not keyword arguments;
# this holds the keys every transformer config must provide.
transformer_config = {'op_type': 'transformer',
                      'request_names': list(req),
                      'new_names': list(req),
                      'input_x_type': pd.core.series.Series,
                      'input_y_type': pd.core.series.Series,
                      'output_x_type': pd.core.series.Series,
                      'output_y_type': pd.core.series.Series}

# NOTE(review): the speller config path is machine-specific (same value as the Speller default).
speller = Speller(dict(transformer_config, name='Speller',
                       path='/home/mks/projects/intent_classification_script/DeepPavlov/deeppavlov/configs/error_model/brillmoore_kartaslov_ru.json'))
lemma = Lemmatizer(dict(transformer_config, name='Lemmatizator'))
tokenizer = Tokenizer(dict(transformer_config, name='Tokenizator'))

In [None]:
# transform() writes into the dataset it is given and returns it, so dataset_ and dataset are the same object.
dataset_ = tokenizer.transform(dataset)
dataset_.data['test'].head()

In [None]:
# No config: the class defaults apply, including its hard-coded fastText model path.
fastvec = FasttextVectorizer()

In [None]:
# Adds the vectorized parts under the transformer's new_names and returns the same dataset object.
dataset_ = fastvec.transform(dataset)

In [None]:
# Parts that were read, as resolved by _validate_names during transform().
fastvec.request_names

In [None]:
# Parts that were written by the vectorizer.
fastvec.new_names

In [None]:
# List every part currently stored in the dataset.
dataset_.data.keys()

## Sklearn wrapper

In [None]:
class skwrapper(BaseTransformer):
    """Common base for wrappers around scikit-learn transformers and estimators."""

    def __init__(self, t, config=None):
        """Instantiate ``t`` and configure it from ``config``.

        Args:
            t: scikit-learn class (not an instance). It must offer ``fit`` or
                ``fit_transform`` and ``transform`` or ``predict``.
            config: transformer config; any key that matches a parameter name
                of ``t`` overrides that parameter's default.

        Raises:
            ValueError: if ``config`` is rejected by ``BaseTransformer``.
            TypeError: if ``t`` lacks the required methods.
        """
        super().__init__(config)

        # Check the class before paying for its instantiation.
        can_fit = hasattr(t, "fit") or hasattr(t, "fit_transform")
        can_apply = hasattr(t, "transform") or hasattr(t, "predict")
        if not (can_fit and can_apply):
            raise TypeError("Methods fit/fit_transform and transform/predict are not implemented in class {}. "
                            "Sklearn transformers and estimators should implement them.".format(t))

        self.transformer = t()
        self.trained = False

        # Only keys the wrapped object already knows are forwarded.
        params = self.transformer.get_params()
        for key in params.keys():
            if key in self.config.keys():
                params[key] = self.config[key]

        self.transformer.set_params(**params)

class sktransformer(skwrapper):
    """Wrapper that fits a scikit-learn transformer once and applies it to dataset parts."""

    def __init__(self, t, config=None):
        super().__init__(t, config)

    def _transform(self, dataset):
        """Fit on 'base' when not trained yet, then transform every requested part.

        When the dataset has no 'base' part, one is assembled temporarily from
        the requested parts and removed again after fitting. Results are
        stored as plain dicts ``{request: transformed, report: labels}``.
        """
        request, report = dataset.main_names

        must_fit = hasattr(self.transformer, 'fit_transform') and not self.trained
        if must_fit:
            temporary_base = 'base' not in dataset.data.keys()
            if temporary_base:
                dataset.merge_data(fields_to_merge=self.request_names, delete_parent=False, new_name='base')

            fit_part = dataset.data['base']
            fit_x = fit_part[request]
            fit_y = fit_part[report]
            self.transformer.fit(fit_x, fit_y)
            self.trained = True

            if temporary_base:
                # The merged part only existed for fitting.
                dataset.del_data(['base'])

        for source, target in zip(self.request_names, self.new_names):
            part_x = dataset.data[source][request]
            part_y = dataset.data[source][report]
            dataset.data[target] = {request: self.transformer.transform(part_x),
                                    report: part_y}

        return dataset


class skmodel(skwrapper):
    """Wrapper that trains a scikit-learn estimator on a dataset and predicts dataset parts."""

    def __init__(self, t, config=None):
        super().__init__(t, config)

    def fit(self, dataset):
        """Train on 'train_vec' when present, otherwise on 'train'.

        Returns:
            self

        Raises:
            KeyError: if the dataset has neither part.
        """
        request, report = dataset.main_names

        if 'train_vec' in dataset.data.keys():
            name = 'train_vec'
        elif 'train' in dataset.data.keys():
            name = 'train'
        else:
            raise KeyError('Dataset must contain "train_vec" or "train" fields.')

        X = dataset.data[name][request]
        y = dataset.data[name][report]

        # The former extra test looked up a misspelled attribute and was always true.
        if hasattr(self.transformer, 'fit'):
            self.transformer.fit(X, y)
            self.trained = True

        return self

    def _check_ready(self):
        """Raise when the wrapped object cannot predict or has not been trained."""
        if not hasattr(self.transformer, 'predict'):
            raise TypeError("Method predict is not implemented in '%s' (type %s)."
                            % (self.transformer, type(self.transformer)))

        if not self.trained:
            raise AttributeError('Sklearn model is not trained yet.')

    def _resolve_names(self, dataset, request_names, new_names):
        """Take explicit names when both are given, else fall back to the configured ones."""
        if (request_names is not None) and (new_names):
            self.request_names = request_names
            self.new_names = new_names
        elif not self.request_names:
            # Nothing resolved yet: derive the names from the config.
            self.request_names = []
            self._validate_names(dataset)

    def _predict_parts(self, dataset, request_names, new_names):
        """Yield ``(new_name, prediction)`` for every requested part."""
        self._check_ready()
        request, report = dataset.main_names
        self._resolve_names(dataset, request_names, new_names)

        for name, new_name in zip(self.request_names, self.new_names):
            yield new_name, self.transformer.predict(dataset.data[name][request])

    def predict(self, dataset, request_names=None, new_names=None):
        """Store the predictions for each requested part under its new name.

        Returns:
            the same ``dataset``.

        Raises:
            TypeError: if the wrapped object has no ``predict``.
            AttributeError: if the model is not trained.
        """
        for new_name, prediction in self._predict_parts(dataset, request_names, new_names):
            dataset.data[new_name] = prediction

        return dataset

    def fit_predict(self, dataset, request_names=None, new_names=None):
        """Run ``fit`` and then ``predict``; returns the dataset."""
        self.fit(dataset)
        dataset = self.predict(dataset, request_names, new_names)
        return dataset

    def predict_data(self, dataset, request_names=None, new_names=None):
        """Return the predictions as a list, one entry per requested part.

        Raises:
            TypeError: if the wrapped object has no ``predict``.
            AttributeError: if the model is not trained.
        """
        return [prediction for _, prediction in self._predict_parts(dataset, request_names, new_names)]

    def fit_predict_data(self, dataset, request_names=None, new_names=None):
        """Run ``fit`` and then ``predict_data``; returns the list of predictions."""
        self.fit(dataset)
        res = self.predict_data(dataset, request_names, new_names)
        return res


In [None]:
import scipy.sparse

# Both configs carry every key BaseTransformer.__init__ requires
# (input/output types are given separately for x and y).
info1 = {'op_type': 'vectorizer', 'name': 'tf-idf vectorizer',
         'request_names': ['train', 'valid', 'test'], 'new_names': ['train_vec', 'valid_vec', 'test_vec'],
         'input_x_type': pd.core.series.Series,
         'input_y_type': pd.core.series.Series,
         'output_x_type': scipy.sparse.csr_matrix,
         'output_y_type': pd.core.series.Series}
# request_names and new_names are paired positionally, so they must have equal length.
info2 = {'op_type': 'model', 'name': 'Logistic Regression',
         'request_names': ['valid_vec', 'test_vec'], 'new_names': ['valid_new', 'test_new'],
         'input_x_type': scipy.sparse.csr_matrix,
         'input_y_type': pd.core.series.Series,
         'output_x_type': None,
         'output_y_type': None}

# Classes, not instances: the wrappers instantiate them.
clf = tfidf
lr = LogisticRegression

In [None]:
# Each wrapper instantiates the given scikit-learn class itself and validates its config on construction.
my_vec = sktransformer(clf, config=info1)
my_lr = skmodel(lr, config=info2)

In [None]:
# split() consumes the 'base' part, so it is only run while that part still exists.
if 'base' in dataset.data:
    dataset.split()

# Vectorize the text parts, train on 'train_vec', then predict the held-out parts.
dataset_ = my_vec.transform(dataset)
dataset_ = my_lr.fit(dataset_).predict(dataset_,
                                       request_names=['test_vec', 'valid_vec'],
                                       new_names=['test_new', 'valid_new'])

In [None]:
# Check what container the vectorizer produced for the test requests.
print(type(dataset_.data['test_vec']['request']))

## Class tree

In [None]:
def initialization(name, config=None):
    """Create a pipeline component by name.

    Args:
        name: one of 'speller', 'lemmatizer', 'tokenizer',
            'fasttext_vectorizer', 'count_vectorizer' or 'tf-idf'.
        config: config dict handed to the component.

    Returns:
        the constructed transformer.

    Raises:
        TypeError: if ``name`` is not a known component.
    """
    # Lambdas defer construction, so only the requested component is built.
    # The scikit-learn vectorizers are imported in this notebook as ``tfidf``
    # and ``count``; sktransformer expects the class itself plus the config.
    factories = {
        'speller': lambda: Speller(config),
        'lemmatizer': lambda: Lemmatizer(config),
        'tokenizer': lambda: Tokenizer(config),
        'fasttext_vectorizer': lambda: FasttextVectorizer(config),
        'tf-idf': lambda: sktransformer(tfidf, config),
        'count_vectorizer': lambda: sktransformer(count, config),
    }

    if name not in factories:
        raise TypeError('{} is not implemented.'.format(name))

    return factories[name]()

## Models

In [5]:
class BaseModel(object):
    """Config-driven wrapper around a model class.

    The wrapped model is instantiated lazily on first use, trained on the
    dataset parts in ``fit_names``, and asked to predict the parts in
    ``predict_names``; the predictions are stored under ``new_names``.
    """

    def __init__(self, model, config):
        """Validate the config and the model and store both.

        Args:
            model: model class; it is instantiated by ``init_model`` as
                ``model(config)``.
            config: dict containing 'op_type', 'name', 'fit_names',
                'predict_names', 'new_names', 'input_x_type', 'input_y_type',
                'output_x_type' and 'output_y_type'; the three ``*_names``
                entries must be lists.

        Raises:
            ValueError: if the config is not a dict, lacks a key, or a names
                entry is not a list.
            AttributeError: if the model lacks a required method.
        """
        if not isinstance(config, dict):
            raise ValueError('Input config must be dict, but {} was found.'.format(type(config)))

        keys = ['op_type', 'name', 'fit_names', 'predict_names', 'new_names', 'input_x_type', 'input_y_type',
                'output_x_type', 'output_y_type']

        self.info = dict()

        for x in keys:
            if x not in config.keys():
                raise ValueError('Input config must contain {} key.'.format(x))
            self.info[x] = config[x]

        for x in ('fit_names', 'predict_names', 'new_names'):
            if not isinstance(config[x], list):
                raise ValueError('Parameters fit_names, predict_names and new_names in config must be a list,'
                                 ' but {} "{}" was found.'.format(type(config[x]), config[x]))

        self.config = config
        self.trained = False
        # Set once init_model has replaced the model class by an instance.
        self._model_initialized = False
        self._validate_model(model)
        self.model = model

        # Name spaces
        self.new_names = config['new_names']
        self.fit_names = config['fit_names']
        self.request_names = config['predict_names']

    def _validate_names(self, dataset):
        """Check that every fit and predict part exists in ``dataset``.

        Raises:
            KeyError: if ``fit_names`` is None or a named part is missing.
        """
        if self.fit_names is None:
            raise KeyError('Parameter fit_names in config can not be None.')

        for name in self.fit_names:
            if name not in dataset.data.keys():
                raise KeyError('Key {} not found in dataset.'.format(name))

        if self.request_names is not None:
            for name in self.request_names:
                if name not in dataset.data.keys():
                    raise KeyError('Key {} not found in dataset.'.format(name))

        return self

    def _validate_model(self, model):
        """Check that ``model`` offers fit/train, restore/load, save and predict.

        Raises:
            AttributeError: naming the first missing capability.
        """
        if not (hasattr(model, 'fit') or hasattr(model, 'train')):
            raise AttributeError("Model don't supported fit or train methods method.")
        elif not (hasattr(model, 'restore') or hasattr(model, 'load')):
            raise AttributeError("Model don't supported restore or load methods method.")
        elif not hasattr(model, 'save'):
            raise AttributeError("Model don't supported save methods method.")
        elif not hasattr(model, 'predict'):
            raise AttributeError("Model don't supported predict method.")

        return self

    def init_model(self, dataset):
        """Replace the stored model class by an instance built from the config."""
        self.model = self.model(self.config)
        return self

    def _ensure_model(self, dataset):
        """Run ``init_model`` exactly once, so a trained model is never rebuilt."""
        if not getattr(self, '_model_initialized', False):
            self.init_model(dataset)
            self._model_initialized = True
        return self

    def fit(self, dataset):
        """Train the model on every part in ``fit_names`` and return self.

        ``train`` is used when the model has it, otherwise ``fit``; a model
        offering both is trained once per part, not twice.
        """
        self._validate_names(dataset)
        self._ensure_model(dataset)

        for name in self.fit_names:
            if hasattr(self.model, 'train'):
                self.model.train(dataset, name)
            elif hasattr(self.model, 'fit'):
                self.model.fit(dataset, name)

        if self.fit_names:
            self.trained = True

        return self

    def predict(self, dataset):
        """Store the predictions for ``predict_names`` under ``new_names``.

        Returns:
            the same ``dataset``.

        Raises:
            TypeError: if the model is not trained.
        """
        if not self.trained:
            raise TypeError('Model is not trained yet.')
        self._validate_names(dataset)
        self._ensure_model(dataset)

        for name, new_name in zip(self.request_names, self.new_names):
            dataset.data[new_name] = self.model.predict(dataset, name)

        return dataset

    def predict_data(self, dataset):
        """Return the predictions as a dict keyed by ``new_names``.

        Raises:
            TypeError: if the model is not trained.
        """
        if not self.trained:
            raise TypeError('Model is not trained yet.')
        self._validate_names(dataset)
        self._ensure_model(dataset)

        prediction = {}
        for name, new_name in zip(self.request_names, self.new_names):
            prediction[new_name] = self.model.predict(dataset, name)

        return prediction

    def fit_predict(self, dataset):
        """Run ``fit`` and then ``predict``; returns the dataset."""
        self.fit(dataset)
        dataset = self.predict(dataset)
        return dataset

    def fit_predict_data(self, dataset):
        """Run ``fit`` and then ``predict_data``; returns the predictions dict."""
        self.fit(dataset)
        prediction = self.predict_data(dataset)
        return prediction

    def get_params(self):
        """Return the config dict."""
        return self.config

    def set_params(self, params):
        """Replace the config dict and return self; the name lists are left as they are."""
        self.config = params
        return self

    def save(self, path=None):
        """Delegate to the model's ``save``, passing ``path`` when given."""
        if path is not None:
            self.model.save(path)
        else:
            self.model.save()
        return self

    def restore(self, path=None):
        """Delegate to the model's ``restore`` and mark the wrapper as trained.

        NOTE(review): this calls ``restore`` on whatever ``self.model`` holds;
        before the first ``fit`` that is still the model class, not an instance.

        Raises:
            TypeError: if ``path`` is given but is not a string.
        """
        if path is not None:
            if isinstance(path, str):
                self.model.restore(path)
                self.trained = True
            else:
                raise TypeError('Restore path must be str, but {} was found.'.format(type(path)))
        else:
            self.model.restore()
            self.trained = True

        return self

## Test of models wrapper

In [6]:
class CNN(BaseModel):
    """Wrapper that instantiates the underlying model from ``self.config``."""

    def init_model(self, dataset):
        """Store the dataset's classes in the config and build the model once.

        ``self.model`` is called with ``self.config`` to obtain the model
        object. The wrapper's ``fit``, ``predict`` and ``predict_data`` each
        call ``init_model``, so this method runs several times per wrapper.
        Re-running the construction would call the already-built model object
        instead of its class, so the model is built only on the first call.
        A later call still rebuilds if ``self.model`` has been reset to a class.

        Args:
            dataset: object exposing ``get_classes()``.

        Returns:
            ``self``.
        """
        classes = dataset.get_classes()
        # The config carries the classes as one space-separated string.
        self.config['classes'] = ' '.join([str(x) for x in classes])

        # Build when `self.model` is still a class, or when nothing has been
        # built yet (covers non-class factories passed as the model).
        if isinstance(self.model, type) or not getattr(self, '_model_built', False):
            self.model = self.model(self.config)
            self._model_built = True
        return self

In [7]:
from intent_classifier.intent_model.model_wrap import KerasMulticlassModel

In [8]:
# Load the CNN configuration from its JSON file into the `config` dict.
# NOTE(review): the path is absolute and machine-specific; the cell fails on any
# machine where this directory does not exist.
with open('/home/mks/projects/intent_classification_script/configs/models/CNN/CNN_opt.json', 'r') as conf:
    config = json.load(conf)

In [9]:
config

{'batch_size': 64,
 'coef_reg_cnn': 0.0001,
 'coef_reg_den': 0.0001,
 'confident_threshold': 0.5,
 'dense_size': 100,
 'dropout_rate': 0.5,
 'embedding_size': 300,
 'epochs': 1,
 'fasttext_model': './reddit_fasttext_model.bin',
 'filters_cnn': 256,
 'kernel_sizes_cnn': '1 2 3',
 'lear_metrics': 'binary_accuracy fmeasure',
 'lear_rate': 0.1,
 'lear_rate_decay': 0.1,
 'loss': 'binary_crossentropy',
 'model_from_saved': False,
 'model_name': 'cnn_model',
 'model_path': './cnn_model_0',
 'module': 'fasttext',
 'optimizer': 'Adam',
 'show_examples': False,
 'text_size': 15,
 'val_every_n_epochs': 5,
 'val_patience': 5,
 'verbose': True}

In [10]:
# Names of the wrapper-level entries that are added to the model config below.
keys = ['op_type', 'name', 'fit_names', 'predict_names', 'new_names', 'input_x_type', 'input_y_type',
        'output_x_type', 'output_y_type']

# Add the wrapper-level entries to the loaded model config in one step.
config.update({
    'op_type': 'model',
    'name': 'cnn',
    'fit_names': ['train'],
    'predict_names': ['test'],
    'new_names': ['predicted_test'],
    'input_x_type': str(pd.Series),
    'input_y_type': str(pd.Series),
    'output_x_type': str(pd.Series),
    'output_y_type': str(pd.Series),
})

In [11]:
config['fasttext_model'] = './data/russian/embeddings/ft_0.8.3_nltk_yalen_sg_300.bin'

In [12]:
# Sanity check: the three "*_names" entries must be lists; print the type of
# any entry that is not.
for x in keys:
    if x in ('fit_names', 'predict_names', 'new_names') and not isinstance(config[x], list):
        print(type(config[x]))

In [13]:
model = CNN(KerasMulticlassModel, config)

In [14]:
model.fit(dataset)


____Training over 36221 samples____


train -->	updates: 1	loss: 0.9130334258079529	binary_accuracy: 0.5	fmeasure: 0.08417507261037827	 
train -->	updates: 501	loss: 0.18026073276996613	binary_accuracy: 0.9586396813392639	fmeasure: 0.47058820724487305	 
epochs_done: 1


FileExistsError: [Errno 17] File exists: 'cnn_model_0'

## Pipeline

In [None]:
class Pipeline(object):
    """Sequential chain of operations applied to a dataset.

    ``pipe`` is an iterable of steps; element ``[1]`` of each step is either
    ``None`` (the step is skipped) or an operation exposing
    ``info['op_type']``. Three operation types are recognised:

    * ``'transformer'`` - ``transform(dataset)`` is called and its return value
      becomes the dataset for the following steps;
    * ``'vectorizer'`` - the dataset is split first if it has no ``'train'``
      entry, then ``transform(dataset)`` is called and its return value is
      ignored;
    * ``'model'`` - ``fit(dataset)`` or ``predict(dataset)`` is called,
      depending on the pipeline method being run.

    Operations with any other ``op_type`` are ignored.
    """

    def __init__(self, pipe):
        self.pipe = pipe

    def _preprocess(self, operation, dataset):
        """Run a non-model step and return the dataset for the next step."""
        op_type = operation.info['op_type']
        if op_type == 'transformer':
            return operation.transform(dataset)
        if op_type == 'vectorizer':
            if 'train' not in dataset.data.keys():
                dataset.split()
            # The vectorizer's return value is deliberately not used.
            operation.transform(dataset)
        return dataset

    def fit(self, dataset):
        """Run all steps in order, fitting every 'model' step; return ``self``."""
        for step in self.pipe:
            operation = step[1]
            if operation is None:
                continue
            if operation.info['op_type'] == 'model':
                operation.fit(dataset)
            else:
                dataset = self._preprocess(operation, dataset)

        print('[ Train End. ]')

        return self

    def predict(self, dataset):
        """Run all steps in order and return the last 'model' step's prediction.

        Returns ``None`` when the pipeline contains no 'model' step.
        """
        prediction = None

        for step in self.pipe:
            operation = step[1]
            if operation is None:
                continue
            if operation.info['op_type'] == 'model':
                prediction = operation.predict(dataset)
            else:
                dataset = self._preprocess(operation, dataset)

        print('[ Prediction End. ]')

        return prediction

    def run(self, fit_dataset, predict_dataset=None):
        """Fit on ``fit_dataset``, then predict on ``predict_dataset``.

        When ``predict_dataset`` is ``None`` the prediction is made on
        ``fit_dataset``.
        """
        self.fit(fit_dataset)
        target = fit_dataset if predict_dataset is None else predict_dataset
        prediction = self.predict(target)

        print('[ End. ]')

        return prediction

## Pipeline test

In [None]:
# Read the raw CSV, wrap it in a Dataset with a fixed seed and split it.
# NOTE(review): the path is absolute and machine-specific.
path = '/home/mks/projects/intent_classification_script/data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
dataset = Dataset(global_data, seed=42)
# NOTE(review): Pipeline calls `dataset.split()` without using its return value;
# confirm that split() returns the dataset, otherwise `dataset` is rebound here
# to whatever split() returns.
dataset = dataset.split()

In [None]:
# Replace the 'train' part with the 'valid' part, then drop the 'valid' entry.
# presumably done to run the pipeline on a smaller training set - TODO confirm
dataset.data['train'] = dataset.data['valid']
dataset.del_data(['valid'])
# Show which parts remain in the dataset.
dataset.data.keys()

In [None]:
# Dataset parts every operation in the test pipeline is asked to process.
req = ['train', 'test']
# Operation descriptors; Pipeline dispatches on the 'op_type' entry.
info1 = {'op_type': 'vectorizer', 'name': 'tf-idf vectorizer'}
# The wrapped estimator is sklearn's LogisticRegression, so label it as such
# (it was previously mislabelled 'Linear Regression').
info2 = {'op_type': 'model', 'name': 'Logistic Regression'}

In [None]:
# `tfidf` is the notebook's alias for sklearn's TfidfVectorizer, so `clf` is a
# vectorizer despite its name; `lr` is the classifier.
clf = tfidf()
lr = LogisticRegression()

# Wrap both sklearn objects for use as pipeline steps. The vectorizer's output
# for the requested 'train'/'test' parts is named 'train_vec'/'test_vec'.
my_vec = sktransformer(clf, info=info1,
                       request_names=req,
                       new_names=['train_vec', 'test_vec'])
# NOTE(review): the model is asked for 'train'/'test', not the vectorized
# 'train_vec'/'test_vec' - confirm this is what skmodel expects.
my_lr = skmodel(lr, info=info2, request_names=req)

In [None]:
# Instantiate the custom text-processing operations for the 'train'/'test' parts.
# NOTE(review): `info` is passed as a list here, whereas the Pipeline class in
# this notebook reads `operation.info['op_type']` and `info1`/`info2` are dicts.
# Confirm these classes convert the list; otherwise they cannot be used in a
# Pipeline.
speller = Speller(info=['transformer', 'Speller'], request_names=req)
lemma = Lemmatizer(info=['transformer', 'Lemmatizator'], request_names=req)
tokenizer = Tokenizer(info=['transformer', 'Tokenizator'], request_names=req)
fastvec = FasttextVectorizer(info=['vectorizer', 'fasttext'], request_names=req,
                             new_names=['train_vec', 'test_vec'])

In [None]:
pipe = [('tf-idf', my_vec), ('LR', my_lr)]

In [None]:
pipeline = Pipeline(pipe)

In [None]:
# Keep the Pipeline object in `pipeline` and bind the prediction result to
# `dataset_`, the name the following cell reads. Previously the result
# overwrote `pipeline` and `dataset_` was never defined.
dataset_ = pipeline.fit(dataset).predict(dataset)

In [None]:
dataset_.data['test'].head()

## Tiny test

In [None]:
# Three-row toy frame with a text column ('request') and an integer label
# column ('report'), wrapped in a Dataset for quick checks of the operations.
tiny_test_data = pd.DataFrame({'request': ['Это тут', "типо такой", "тест офигенный"],
                               'report': [0, 1, 2]})
tiny_test_dataset = Dataset(tiny_test_data)
# Show which parts the freshly built Dataset contains.
tiny_test_dataset.data.keys()

In [None]:
tiny_dataset = speller.transform(tiny_test_dataset)

In [None]:
tiny_dataset = tokenizer.transform(tiny_test_dataset)

In [None]:
tiny_dataset = lemma.transform(tiny_test_dataset)

In [None]:
# Splitting is left disabled for the tiny test.
# tiny_dataset.split()
# NOTE(review): like the preceding cells, this passes `tiny_test_dataset` rather
# than the `tiny_dataset` returned by the previous step. The steps only chain if
# transform() modifies its argument in place - confirm.
tiny_dataset = fastvec.transform(tiny_test_dataset)

In [None]:
tiny_dataset.data.keys()

In [None]:
tiny_dataset.data['base']