In [None]:
from script.core.transformers import *
from script.core.models import skmodel, sktransformer, BaseModel
from script.core.dataset import Watcher
from script.core.utils import read_dataset, get_result
from script.core.pipeline import PrepPipeline

# linear models
from sklearn.svm import LinearSVC
# sklearn feature extractors
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

from script.core.utils import logging
import datetime

# Read csv file and create Dataset

In [None]:
# Load the raw CSV, wrap it in a Dataset and split it; the two 0.1 values are
# the splitting proportions handed to `split`.
# NOTE(review): `Dataset` is not among the names imported explicitly in the
# import cell (only `Watcher` is) -- presumably it arrives through the star
# import from script.core.transformers; confirm on a fresh kernel.
path = './data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
dataset = Dataset(global_data, seed=42).split([0.1, 0.1])

# Show which splits exist and how large the 'valid' one is.
print(dataset.data.keys(), len(dataset.data['valid']), sep='\n')

# Continue with the 'test' split only, re-wrapped as a fresh Dataset.
data = dataset.data['test']
dataset = Dataset(data, seed=42)
print(dataset.data.keys())

# Operations

In [None]:
# Configs for the text-preprocessing operations. Each one carries an operation
# type, a display name and two lists of field names -- presumably the dataset
# fields the operation reads ('request_names') and writes ('new_names');
# TODO confirm against the pipeline implementation.
spl_conf = dict(
    op_type='transformer',
    name='Speller',
    request_names=['base'],
    new_names=['base'],
    path='./DeepPavlov/deeppavlov/configs/error_model/brillmoore_kartaslov_ru.json',
)

tok_conf = dict(
    op_type='transformer',
    name='Tokenizer',
    request_names=['base'],
    new_names=['base'],
)

lem_conf = dict(
    op_type='transformer',
    name='Lemmatizer',
    request_names=['base'],
    new_names=['base'],
)

# Ready-made operation instance built with its default arguments.
concat = TextConcat()

# Two vectorizer configs that differ only in the spelling of 'name'
# (space vs. underscore).
tfidf_conf_1 = dict(
    op_type='vectorizer',
    name='tf-idf vectorizer',
    request_names=['train', 'valid', 'test'],
    new_names=['train_vec', 'valid_vec', 'test_vec'],
)
tfidf_conf_2 = dict(
    op_type='vectorizer',
    name='tf-idf_vectorizer',
    request_names=['train', 'valid', 'test'],
    new_names=['train_vec', 'valid_vec', 'test_vec'],
)

# Wrap sklearn's TfidfVectorizer (imported as `tfidf`) using the first config.
tfidf_ = sktransformer(tfidf, tfidf_conf_1)

# Linear Models

In [None]:
# Base configuration shared by all classifiers below. It lists the field names
# passed as 'fit_names' / 'predict_names' and the name under which predictions
# are stored ('new_names').
conf_0 = {'op_type': 'model', 'name': 'Linear Regression',
          'fit_names': ['train_vec'], 'new_names': ['predicted_test'],
          'predict_names': ['test_vec']}

# Each wrapper receives its own copy of the base config with an accurate
# 'name'. Previously every model shared the very same dict and was therefore
# labelled 'Linear Regression', which none of these estimators is.
# NOTE(review): the wrappers are bound to the same names as the estimator
# classes they wrap, so re-running this cell feeds the wrapper back into
# `skmodel`; restart the kernel before re-running.
# NOTE(review): LogisticRegression, LGBMClassifier and RandomForestClassifier
# are not imported explicitly in the import cell -- presumably they come from
# the star import; confirm on a fresh kernel.
LogisticRegression = skmodel(LogisticRegression, dict(conf_0, name='LogisticRegression'))
LGBMClassifier = skmodel(LGBMClassifier, dict(conf_0, name='LGBMClassifier'))
LinearSVC = skmodel(LinearSVC, dict(conf_0, name='LinearSVC'))
RandomForestClassifier = skmodel(RandomForestClassifier, dict(conf_0, name='RandomForestClassifier'))

## Run pipeline with SVC

In [None]:
# Reload the CSV and split it 0.99 / 0.01 so that only a small slice is pushed
# through the pipeline.
path = './data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
dataset = Dataset(global_data, seed=42)
dataset = dataset.split(splitting_proportions=[0.99, 0.01])

print(dataset.data.keys())
print(len(dataset.data['test']))

# Keep only the 'test' slice and wrap it in a fresh Dataset.
data = dataset.data['test']
dataset = Dataset(data, seed=42)
print(dataset.data.keys())

# Pipeline steps: spelling correction, tokenization, lemmatization, text
# concatenation, tf-idf vectorization and finally the wrapped LinearSVC.
# NOTE(review): Lemmatizer is listed without a config, so `lem_conf` defined
# earlier is never used -- confirm this is intended.
pipe_1 = [(Speller, spl_conf), (Tokenizer, tok_conf), (Lemmatizer,), (concat, None), (tfidf_, tfidf_conf_2),
          (LinearSVC,)]
# NOTE(review): pipeline_2 (mode='train') is constructed but never run in this
# cell; only the 'infer' pipeline below is executed -- confirm the model does
# not need a separate training pass first.
pipeline_2 = BasePipeline(pipe_1, mode='train', output=None)
pipeline_3 = BasePipeline(pipe_1, mode='infer', output='dataset')

data_ = pipeline_3.run(dataset)
# A bare expression in the middle of a cell is evaluated and thrown away, so
# the keys are printed explicitly; only the last expression is displayed.
print(data_.data.keys())
data_.data['test_new']