In [None]:
from script.core.transformers import *
from script.core.models import skmodel, sktransformer, BaseModel
from script.core.dataset import Dataset
from script.core.utils import read_dataset, get_result
from script.core.pipeline import BasePipeline

# linear models
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# sklearn feature extractors
from sklearn.feature_extraction.text import CountVectorizer as count
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.preprocessing import LabelEncoder

from intent_classifier.intent_model.model_wrap import KerasMulticlassModel as DilyaraModel
from script.models.cnn import CNN
from script.models.dcnn import DCNN

from script.core.utils import logging
import datetime

# Read CSV file and create Dataset

In [None]:
# Load the full CSV and wrap it in a Dataset created with a fixed seed.
path = './data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
dataset = Dataset(global_data, seed=42)
# split() receives two proportions and its result is rebound to `dataset`.
# NOTE(review): which parts the two numbers refer to is not visible here — confirm.
dataset = dataset.split([0.1, 0.1])

print(dataset.data.keys())
# NOTE(review): this reports the size of 'valid', but the subset kept below is 'test'.
print(len(dataset.data['valid']))

# Keep only the 'test' part and rebuild `dataset` from it, so the cells that
# use `dataset` next work on this subset until `dataset` is rebound again.
data = dataset.data['test']
dataset = Dataset(data, seed=42)
print(dataset.data.keys())

# Operations

In [None]:
# Configuration dicts that the pipeline cells pair with operations as
# (operation, config) tuples.
# NOTE(review): 'request_names' / 'new_names' are presumably the dataset keys
# an operation reads from / writes to — confirm against BasePipeline.

# Speller config; 'path' points at a DeepPavlov error_model JSON config.
spl_conf = {'op_type': 'transformer',
            'name': 'Speller',
            'request_names': ['base'],
            'new_names': ['base']
            ,
            'path': './DeepPavlov/deeppavlov/configs/error_model/brillmoore_kartaslov_ru.json'}

# Tokenizer config (redefined with the same content in the neural-pipeline cell).
tok_conf = {'op_type': 'transformer',
            'name': 'Tokenizer',
            'request_names': ['base'],
            'new_names': ['base']}

# Lemmatizer config (the neural-pipeline cell rebinds this name with
# 'name': 'Lemmatizator').
lem_conf = {'op_type': 'transformer',
            'name': 'Lemmatizer',
            'request_names': ['base'],
            'new_names': ['base']}

# Unlike Speller/Tokenizer/Lemmatizer, which the pipelines reference by name,
# the concatenation step is instantiated here and passed as an object.
concat = TextConcat()

# The two TF-IDF configs differ only in 'name' ('tf-idf vectorizer' vs
# 'tf-idf_vectorizer'). `tfidf_` is built with tfidf_conf_1, while the pipeline
# cells pair `tfidf_` with tfidf_conf_2.
tfidf_conf_1 = {'op_type': 'vectorizer', 'name': 'tf-idf vectorizer',
                'request_names': ['train', 'valid', 'test'], 'new_names': ['train_vec', 'valid_vec', 'test_vec']}
tfidf_conf_2 = {'op_type': 'vectorizer', 'name': 'tf-idf_vectorizer',
                'request_names': ['train', 'valid', 'test'], 'new_names': ['train_vec', 'valid_vec', 'test_vec']}
tfidf_ = sktransformer(tfidf, tfidf_conf_1)

# Neural Models

In [None]:
class GetCNN(BaseModel):
    """BaseModel variant that records the dataset's class labels in its config."""

    def init_model(self, dataset):
        """Store the class labels in the config, then run the parent initialisation.

        The values returned by ``dataset.get_classes()`` are converted to
        strings and joined with single spaces under ``self.config['classes']``
        before the parent's ``init_model`` is called with the same dataset.

        Returns:
            GetCNN: this instance.
        """
        labels = map(str, dataset.get_classes())
        self.config['classes'] = ' '.join(labels)
        super().init_model(dataset)
        return self

def get_cnn_conf(path, emb_path, fit_names=None, predict_names=None, new_names=None):
    """Load a CNN model config from a JSON file and fill in the pipeline fields.

    Args:
        path: path to the JSON file holding the model config.
        emb_path: value stored under ``'fasttext_model'``.
        fit_names: value for ``'fit_names'``; ``['train_vec']`` when None.
        predict_names: value for ``'predict_names'``; ``['test_vec']`` when None.
        new_names: value for ``'new_names'``; ``['predicted_test']`` when None.

    Returns:
        dict: the parsed config with ``'op_type'``, ``'name'``, the three name
        lists and ``'fasttext_model'`` set, overwriting any values the file
        had under those keys.

    Raises:
        OSError: if ``path`` cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Imported here so the function does not rely on `json` leaking into the
    # notebook namespace through a wildcard import.
    import json

    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as conf:
        config = json.load(conf)

    config['op_type'] = 'model'
    config['name'] = 'cnn'

    # `is None` (not truthiness) so that an explicitly passed empty list is kept.
    config['fit_names'] = ['train_vec'] if fit_names is None else fit_names
    config['predict_names'] = ['test_vec'] if predict_names is None else predict_names
    config['new_names'] = ['predicted_test'] if new_names is None else new_names

    config['fasttext_model'] = emb_path

    return config

# Two alternative model config files and the embedding file that get_cnn_conf
# stores under 'fasttext_model'.
path_0 = './configs/models/CNN/CNN_opt.json'
path_1 = './configs/models/CNN/cnn.json'
emb_path = './data/russian/embeddings/ft_0.8.3_nltk_yalen_sg_300.bin'

# Disabled alternative: DilyaraModel with CNN_opt.json, using 'train'/'test'
# instead of the default *_vec names.
# config = get_cnn_conf(path_0, emb_path, fit_names=['train'], predict_names=['test'])
# model = GetCNN(DilyaraModel, config)

# Active choice: CNN with cnn.json and get_cnn_conf's default names.
config = get_cnn_conf(path_1, emb_path)
model = GetCNN(CNN, config)

# Disabled alternative: DCNN with the same config.
# config = get_cnn_conf(path_1, emb_path)
# model = GetCNN(DCNN, config)

# Neural pipelines

## Run pipeline with neural model

In [None]:
# Rebinds tok_conf with the same content as in the operations cell.
tok_conf = {'op_type': 'transformer',
            'name': 'Tokenizer',
            'request_names': ['base'],  # 'train', 'valid', 'test'
            'new_names': ['base']}
# Rebinds lem_conf; 'name' is 'Lemmatizator' here but 'Lemmatizer' in the
# operations cell. NOTE(review): confirm which spelling the framework expects.
lem_conf = {'op_type': 'transformer',
            'name': 'Lemmatizator',
            'request_names': ['base'],
            'new_names': ['base']}
con_conf = {'op_type': 'transformer',
            'name': 'Concatenizer',
            'request_names': ['base'],
            'new_names': ['base']}

# Disabled steps; lem_conf and con_conf are only referenced from this comment.
# (Speller, spl_conf),  (Lemmatizer, lem_conf), (concat, con_conf), 


# Steps: Tokenizer -> FasttextVectorizer -> `model` (the GetCNN instance built earlier).
neuro_pipe_0 = [(Tokenizer, tok_conf), (FasttextVectorizer,), (model,)]
# The same step list is wrapped in two modes; only pipeline_9 is run below.
pipeline_8 = BasePipeline(neuro_pipe_0, mode='train', output=None)
pipeline_9 = BasePipeline(neuro_pipe_0, mode='infer', output='dataset')

data_ = pipeline_9.run(dataset)
# data_.data.keys()
# data_.data['predicted_test']

# Linear Models

In [None]:
# The bare names LogisticRegression, LGBMClassifier, LinearSVC and
# RandomForestClassifier are rebound below to their skmodel wrappers because
# the pipeline cells reference them under those names. The estimator classes
# are therefore taken by module path, so re-running this cell wraps the
# original classes again instead of wrapping the wrappers.
import lightgbm
import sklearn.ensemble
import sklearn.linear_model
import sklearn.svm

# One config object is shared by all four wrappers.
# NOTE(review): 'name' says 'Linear Regression' for every model, including the
# non-linear ones; left unchanged because it may be relied on in existing logs.
conf_0 = {'op_type': 'model', 'name': 'Linear Regression',
          'fit_names': ['train_vec'], 'new_names': ['predicted_test'],
          'predict_names': ['test_vec']}

LogisticRegression = skmodel(sklearn.linear_model.LogisticRegression, conf_0)
LGBMClassifier = skmodel(lightgbm.LGBMClassifier, conf_0)
LinearSVC = skmodel(sklearn.svm.LinearSVC, conf_0)
RandomForestClassifier = skmodel(sklearn.ensemble.RandomForestClassifier, conf_0)

## Linear pipelines

## Run pipeline with logistic regression

In [None]:
# Disabled preprocessing steps:
# (Tokenizer, tok_conf), (Lemmatizer,), (concat, None), 
# Steps: tfidf_ -> LogisticRegression (the skmodel wrapper bound to that name) -> GetResultLinear.
pipe_0 = [(tfidf_, tfidf_conf_2), (LogisticRegression,), (GetResultLinear,)]
# The same step list is wrapped in two modes; the following cell runs pipeline_1 only.
pipeline_0 = BasePipeline(pipe_0, mode='train', output=None)
pipeline_1 = BasePipeline(pipe_0, mode='infer', output='dataset')

In [None]:
# Run the mode='infer' pipeline on the current `dataset`; later cells read `res.data`.
res = pipeline_1.run(dataset)

In [None]:
# Keep the pipeline's `pipeline_config` attribute in `conf` and display it.
conf = pipeline_1.pipeline_config
conf

In [None]:
# Display the 'results' entry of the pipeline output.
res.data['results']

In [None]:
# Rebind `conf` to a plain dict built from the pipeline config; it is passed to logging() next.
conf = dict(conf)

In [None]:
# Write the results and the pipeline config to a per-day log file.
date = datetime.datetime.now()
# timetuple()[:3] is (year, month, day); month and day are not zero-padded,
# so the name looks like '2018-3-7.txt'.
name = '{}-{}-{}.txt'.format(*date.timetuple()[:3])
logging(res.data['results'], conf, name)

## Run pipeline with SVC

In [None]:
# Reload the full CSV and rebuild `dataset`; this cell does not reuse the
# `dataset` left behind by earlier cells.
path = './data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
dataset = Dataset(global_data, seed=42)
dataset = dataset.split(splitting_proportions=[0.99, 0.01])

print(dataset.data.keys())
print(len(dataset.data['test']))

# Continue with the 'test' part only, re-wrapped as a fresh Dataset.
data = dataset.data['test']
dataset = Dataset(data, seed=42)
print(dataset.data.keys())

# Steps: Speller -> Tokenizer -> Lemmatizer -> concat -> tfidf_ -> LinearSVC wrapper.
pipe_1 = [(Speller, spl_conf), (Tokenizer, tok_conf), (Lemmatizer,), (concat, None), (tfidf_, tfidf_conf_2),
          (LinearSVC,)]
# The same step list is wrapped in two modes; only pipeline_3 is run here.
pipeline_2 = BasePipeline(pipe_1, mode='train', output=None)
pipeline_3 = BasePipeline(pipe_1, mode='infer', output='dataset')

data_ = pipeline_3.run(dataset)
# Printed explicitly: a bare expression that is not the last statement of a
# cell is evaluated but not displayed.
print(data_.data.keys())
# NOTE(review): no config visible in this notebook names 'test_new' (conf_0
# uses 'predicted_test' in 'new_names') — confirm this key exists.
data_.data['test_new']

## Run pipeline with Random Forest

In [None]:
# Reload the full CSV and rebuild `dataset`; this cell does not reuse the
# `dataset` left behind by earlier cells.
path = './data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
dataset = Dataset(global_data, seed=42)
dataset = dataset.split(splitting_proportions=[0.99, 0.01])

print(dataset.data.keys())
print(len(dataset.data['test']))

# Continue with the 'test' part only, re-wrapped as a fresh Dataset.
data = dataset.data['test']
dataset = Dataset(data, seed=42)
print(dataset.data.keys())

# Steps: Speller -> Tokenizer -> Lemmatizer -> concat -> tfidf_ -> RandomForestClassifier wrapper.
pipe_2 = [(Speller, spl_conf), (Tokenizer, tok_conf), (Lemmatizer,), (concat, None), (tfidf_, tfidf_conf_2),
          (RandomForestClassifier,)]
# The same step list is wrapped in two modes; only pipeline_5 is run here.
pipeline_4 = BasePipeline(pipe_2, mode='train', output=None)
pipeline_5 = BasePipeline(pipe_2, mode='infer', output='dataset')

data_ = pipeline_5.run(dataset)
# Printed explicitly: a bare expression that is not the last statement of a
# cell is evaluated but not displayed.
print(data_.data.keys())
# NOTE(review): no config visible in this notebook names 'test_new' (conf_0
# uses 'predicted_test' in 'new_names') — confirm this key exists.
data_.data['test_new']

## Run pipeline with GBM

In [None]:
# Reload the full CSV and rebuild `dataset`; this cell does not reuse the
# `dataset` left behind by earlier cells.
path = './data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
dataset = Dataset(global_data, seed=42)
dataset = dataset.split(splitting_proportions=[0.99, 0.01])

print(dataset.data.keys())
print(len(dataset.data['test']))

# Continue with the 'test' part only, re-wrapped as a fresh Dataset.
data = dataset.data['test']
dataset = Dataset(data, seed=42)
print(dataset.data.keys())

# Steps: Speller -> Tokenizer -> Lemmatizer -> concat -> tfidf_ -> LGBMClassifier wrapper.
pipe_3 = [(Speller, spl_conf), (Tokenizer, tok_conf), (Lemmatizer,), (concat, None), (tfidf_, tfidf_conf_2),
          (LGBMClassifier,)]
# The same step list is wrapped in two modes; only pipeline_7 is run here.
pipeline_6 = BasePipeline(pipe_3, mode='train', output=None)
pipeline_7 = BasePipeline(pipe_3, mode='infer', output='dataset')

data_ = pipeline_7.run(dataset)
# Printed explicitly: a bare expression that is not the last statement of a
# cell is evaluated but not displayed.
print(data_.data.keys())
# NOTE(review): no config visible in this notebook names 'test_new' (conf_0
# uses 'predicted_test' in 'new_names') — confirm this key exists.
data_.data['test_new']