In [8]:
from script.core.transformers import *
from script.core.models import skmodel, sktransformer, BaseModel
from script.core.dataset import Dataset
from script.core.utils import read_dataset, get_result
from script.core.pipeline import PrepPipeline, Pipeline

# linear models
from sklearn.svm import LinearSVC
# sklearn feature extractors
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

from script.core.utils import logging
import datetime
from os.path import join, isfile

import secrets
import os

# Read csv file and create Dataset

In [14]:
class Watcher(Dataset):
    """Dataset that can cache its DataFrame entries on disk, keyed by pipeline config.

    Every call to ``save_data`` writes the DataFrame entries of ``self.data`` to a
    randomly named CSV file under ``self.save_path`` and records
    ``{file name: self.pipeline_config}`` in a JSON registry stored in
    ``self.conf_dict``.  ``check_config`` / ``test_config`` look a configuration
    up in that registry and ``load_data`` restores a saved file.

    ``add_config``, ``pipeline_config``, ``main_names`` and ``del_data`` are not
    defined here; presumably they are provided by ``Dataset`` (TODO confirm).
    """

    # Project root used when ``root`` is not passed to the constructor.
    # NOTE(review): machine-specific absolute path, kept only for backward
    # compatibility; callers should pass ``root`` explicitly.
    _DEFAULT_ROOT = '/home/mks/projects/intent_classification_script/'
    # Name of the JSON registry file inside ``self.conf_dict``.
    _CONF_FILE_NAME = 'pipe_conf_dict.json'
    # Column of a re-loaded CSV that holds the name of the entry a row belongs to:
    # ``save_data`` writes it as the first (unnamed) index level, which pandas
    # reads back under this header.
    _SPLIT_COLUMN = 'Unnamed: 0'

    def __init__(self, data, date, language, dataset_name, seed=None, classes_description=None, root=None,
                 *args, **kwargs):
        """Initialise the dataset and compute the cache locations.

        Args:
            data: forwarded to ``Dataset.__init__``.
            date: object with ``year``, ``month`` and ``day`` attributes
                (e.g. ``datetime.datetime``); stored as ``'Y-M-D'`` in ``self.date``.
            language: path component used to build the cache directory.
            dataset_name: path component used to build the cache directory.
            seed: forwarded to ``Dataset.__init__``.
            classes_description: forwarded to ``Dataset.__init__``.
            root: project root; defaults to ``_DEFAULT_ROOT`` when ``None``.
            *args, **kwargs: forwarded to ``Dataset.__init__``.
        """
        super().__init__(data, seed, classes_description, *args, **kwargs)

        self.date = '{}-{}-{}'.format(date.year, date.month, date.day)

        if root is None:
            root = self._DEFAULT_ROOT

        # Directory with the JSON registry; the CSV files live in its 'data' subdirectory.
        self.conf_dict = join(root, 'data', language, dataset_name, 'log_data')
        self.save_path = join(self.conf_dict, 'data')

    def _conf_path(self):
        """Return the full path of the JSON registry file."""
        return join(self.conf_dict, self._CONF_FILE_NAME)

    def _read_conf(self):
        """Return the registry as a dict; an absent registry file counts as empty."""
        path = self._conf_path()
        if not isfile(path):
            return {}
        with open(path, 'r') as conf_file:
            return json.load(conf_file)

    def test_config(self, conf):
        """Register ``conf`` and load cached data if the resulting config was saved before.

        Returns:
            ``True`` when a matching saved file was found and loaded,
            ``False`` when no saved file matches the configuration.

        Raises:
            ValueError: if ``check_config`` returns something that is neither
                a ``bool`` nor a ``str``.
        """
        self.add_config(conf)
        status = self.check_config(self.pipeline_config)

        if isinstance(status, bool):
            if status:
                return False
        elif isinstance(status, str):
            self.load_data(status)
            return True
        else:
            print(type(status))
            raise ValueError('Incorrect')

        # Only reached when ``check_config`` returns ``False``; the implementation
        # in this class never does, the branch is kept for overriding subclasses.
        return self

    def check_config(self, conf):
        """Look ``conf`` up in the registry.

        Returns:
            The name of the first saved file whose stored configuration equals
            ``conf``, or ``True`` when there is no match (including when the
            registry is empty or does not exist yet).
        """
        for name, saved_conf in self._read_conf().items():
            if saved_conf == conf:
                return name
        return True

    def save_data(self):
        """Write all DataFrame entries of ``self.data`` to a new CSV and register it.

        Entries of ``self.data`` that are not ``pd.DataFrame`` instances are
        skipped.  The file name is a random 32-character hex token; the same
        token becomes the registry key for ``self.pipeline_config``.

        Returns:
            ``self``.

        Raises:
            ValueError: raised by ``pd.concat`` when ``self.data`` holds no DataFrame.
        """
        datanames = [name for name in self.data.keys() if isinstance(self.data[name], pd.DataFrame)]
        dataframes = [self.data[name] for name in datanames]
        # ``keys`` adds an outer index level with the entry name, which
        # ``load_data`` later uses to separate the rows again.
        data = pd.concat(dataframes, keys=datanames)

        secret_name = secrets.token_hex(nbytes=16)

        os.makedirs(self.save_path, exist_ok=True)
        # No '.csv' suffix: ``load_data`` expects the bare token as file name.
        data.to_csv(join(self.save_path, secret_name), encoding='utf-8')

        # Add the new file to the registry, creating the registry if necessary.
        conf_ = self._read_conf()
        conf_[secret_name] = self.pipeline_config
        with open(self._conf_path(), 'w') as conf_file:
            json.dump(conf_, conf_file)

        return self

    def load_data(self, name):
        """Replace the contents of ``self.data`` with the entries saved in file ``name``.

        For every entry name found in the file, the request and report columns
        (named by ``self.main_names``) are stored under ``self.data[entry]``;
        entries of ``self.data`` that are absent from the file are removed with
        ``self.del_data``.

        Args:
            name: file name inside ``self.save_path`` (a token produced by ``save_data``).

        Returns:
            ``self``.
        """
        # Read by path with an explicit encoding so the text is decoded the same
        # way ``save_data`` wrote it, independent of the locale.
        data = pd.read_csv(join(self.save_path, name), encoding='utf-8')

        request, report = self.main_names

        split_column = data[self._SPLIT_COLUMN]
        keys = list(split_column.unique())
        data_keys = list(self.data.keys())

        for key in keys:
            if key not in data_keys:
                self.data[key] = {}
            rows = data[split_column == key]
            # NOTE(review): when ``self.data[key]`` already exists as a DataFrame,
            # these assignments align on the index rather than replace the rows
            # positionally -- confirm that this is intended.
            self.data[key][request] = rows[request]
            self.data[key][report] = rows[report]

        for key in data_keys:
            if key not in keys:
                self.del_data([key])

        return self

In [29]:
def init_dataset_tiny(file_path, language, dataset_name, date, seed=42):
    """Create a reduced ``Watcher`` for quick experiments.

    The file is read with ``read_dataset``, wrapped in a ``Watcher``, split with
    ``split([0.1, 0.1])`` and only the resulting ``'test'`` entry is kept as the
    data of a second, smaller ``Watcher``.

    Args:
        file_path: path of the file handed to ``read_dataset``.
        language: forwarded to ``Watcher``.
        dataset_name: forwarded to ``Watcher``.
        date: forwarded to ``Watcher``.
        seed: seed given to both ``Watcher`` instances.

    Returns:
        The ``Watcher`` built from the ``'test'`` entry.
    """
    # Per the original author's note, the two flags passed here are not
    # read_dataset's default values.
    raw_data = read_dataset(file_path, True, True)

    # A classes_description could be supplied here as well.
    full_watcher = Watcher(raw_data, date, language, dataset_name, seed=seed)

    # Keep only the 'test' entry of the split as the tiny dataset.
    tiny_data = full_watcher.split([0.1, 0.1]).data['test']

    return Watcher(tiny_data, date, language, dataset_name, seed)

# Coordinates of the dataset to load.
language = 'russian'
dataset_name = 'vkusvill'
file_path = join('./data', language, dataset_name, 'data', 'vkusvill_all_categories.csv')

# Timestamp handed to the Watcher (stored there as a 'Y-M-D' string).
date = datetime.datetime.now()

dataset = init_dataset_tiny(file_path=file_path,
                            language=language,
                            dataset_name=dataset_name,
                            date=date)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Show which entries the freshly created dataset holds (cell was not executed: In [None]).
print(dataset.data.keys())

In [22]:
# Restore a previously saved file; the argument is the file name inside dataset.save_path.
dataset.load_data('d90f5f6b59dbff9deec6116641b5973e')

<__main__.Watcher at 0x7fa043003f28>

In [32]:
# Show the entries currently stored in the dataset.
print(dataset.data.keys())

dict_keys(['train', 'valid', 'test', 'train_vec', 'valid_vec', 'test_vec'])


In [33]:
# Inspect the vectorised training entry: per the output below it is a dict with a
# sparse 'request' matrix and a 'report' Series.
print(dataset.data['train_vec'])

{'request': <3667x23139 sparse matrix of type '<class 'numpy.float64'>'
	with 107315 stored elements in Compressed Sparse Row format>, 'report': 0       15
1        3
2       10
3        7
4       13
5        2
6        3
7       12
8        7
9        1
10       6
11       6
12      10
13       4
14       7
15       6
16       3
17       2
18       6
19       6
20       6
21       3
22       7
23      15
24       3
25       1
26      13
27       1
28       6
29       1
        ..
3637     3
3638    11
3639     6
3640    11
3641    13
3642     3
3643    10
3644    10
3645     3
3646     7
3647     6
3648     6
3649    11
3650     1
3651     7
3652     1
3653    12
3654    16
3655    13
3656     2
3657     3
3658    15
3659     2
3660    12
3661     3
3662     7
3663     1
3664     6
3665     6
3666    12
Name: report, Length: 3667, dtype: int64}


In [31]:
# Split the dataset with default arguments, then apply the tf-idf transformer and
# rebind `dataset` to its result.
# NOTE: `tfidf_` is created in the "Operations" section further down in this file
# (executed earlier, as In [26]); on a fresh kernel that cell has to be run first.
dataset.split()
dataset = tfidf_.transform(dataset)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [34]:
# Write the DataFrame entries to a new randomly named CSV and register the
# current pipeline config for it (see Watcher.save_data).
dataset.save_data()

<__main__.Watcher at 0x7fa06d74b2e8>

In [35]:
# Entries held by the dataset after saving.
dataset.data.keys()

dict_keys(['train', 'valid', 'test', 'train_vec', 'valid_vec', 'test_vec'])

# Operations

In [26]:
# Configs for the text transformers; each one lists 'base' as both its
# request name and its new name.
# NOTE(review): spl_conf, tok_conf, lem_conf and tfidf_conf_2 are not referenced
# anywhere else in the visible cells.
spl_conf = dict(
    op_type='transformer',
    name='Speller',
    request_names=['base'],
    new_names=['base'],
    path='./DeepPavlov/deeppavlov/configs/error_model/brillmoore_kartaslov_ru.json',
)

tok_conf = dict(
    op_type='transformer',
    name='Tokenizer',
    request_names=['base'],
    new_names=['base'],
)

lem_conf = dict(
    op_type='transformer',
    name='Lemmatizer',
    request_names=['base'],
    new_names=['base'],
)

concat = TextConcat()

# Two tf-idf configs that differ only in the 'name' value.
tfidf_conf_1 = dict(
    op_type='vectorizer',
    name='tf-idf vectorizer',
    request_names=['train', 'valid', 'test'],
    new_names=['train_vec', 'valid_vec', 'test_vec'],
)
tfidf_conf_2 = dict(
    op_type='vectorizer',
    name='tf-idf_vectorizer',
    request_names=['train', 'valid', 'test'],
    new_names=['train_vec', 'valid_vec', 'test_vec'],
)

# Wrap sklearn's TfidfVectorizer (imported as `tfidf`) with the first config.
tfidf_ = sktransformer(tfidf, tfidf_conf_1)

# Linear Models

In [5]:
# Config for the linear SVC model: fit on 'train_vec', predict on 'test_vec',
# store the result under 'predicted_test'.
conf_0 = {'op_type': 'model', 'name': 'Linear SVC',
          'fit_names': ['train_vec'], 'new_names': ['predicted_test'],
          'predict_names': ['test_vec']}

# The name `LinearSVC` is rebound to the skmodel result below.  Import the sklearn
# class under an alias so that re-running this cell wraps the class again instead
# of passing the previous skmodel result back into skmodel.
from sklearn.svm import LinearSVC as _SklearnLinearSVC

LinearSVC = skmodel(_SklearnLinearSVC, conf_0)

In [11]:
# Each pipeline step is a 1-tuple holding the operation.
# NOTE(review): pipe_2 lists exactly the same steps as pipe_1 (and shares the
# `concat` and `tfidf_` objects) -- confirm whether a different configuration
# was intended for the second pipeline.
pipe_1 = [(operation,) for operation in (Speller, Tokenizer, Lemmatizer, concat, tfidf_)]
pipeline_1 = PrepPipeline(pipe_1)

pipe_2 = [(operation,) for operation in (Speller, Tokenizer, Lemmatizer, concat, tfidf_)]
pipeline_2 = PrepPipeline(pipe_2)