In [115]:
import json
import re
import pandas as pd
import nltk
import pymorphy2
import fasttext
import numpy as np
from tqdm import tqdm

from utils import labels2onehot_one
from deeppavlov.core.commands.infer import build_model_from_config
from dataset import Dataset
from transformer import Speller, Tokenizer, Lemmatizer, FasttextVectorizer
morph = pymorphy2.MorphAnalyzer()

# linear models
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# sklearn feachure extractors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

2018-03-20 21:26:32.191 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 16: Loading dictionaries from /home/mks/envs/intent_script/lib/python3.6/site-packages/pymorphy2_dicts/data
2018-03-20 21:26:32.223 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 20: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


## Dataset

In [2]:
def read_dataset(filepath, duplicates=False, clean=True):
    file = open(filepath, 'r', encoding='ISO-8859-1')
    data = pd.read_csv(file)

    old_names = data.keys()
    names = [n.encode('ISO-8859-1').decode('cp1251').encode('utf8') for n in old_names]
    names = [n.decode('utf-8') for n in names]

    new_data = dict()
    for old, new in zip(old_names, names):
        new_data[new] = list()
        for c in data[old]:
            try:
                s = c.encode('ISO-8859-1').decode('cp1251').encode('utf8')
                s = s.decode('utf-8')
                new_data[new].append(s)
            except AttributeError:
                new_data[new].append(c)

    new_data = pd.DataFrame(new_data, columns=['Описание', 'Категория жалобы'])
    new_data.rename(columns={'Описание': 'request', 'Категория жалобы': 'report'}, inplace=True)
    new_data = new_data.dropna()  # dell nan
    if not duplicates:
        new_data = new_data.drop_duplicates()  # dell duplicates

    # как отдельную ветвь можно использовать
    if clean:
        delete_bad_symbols = lambda x: " ".join(re.sub('[^а-яa-zё0-9]', ' ', x.lower()).split())
        new_data['request'] = new_data['request'].apply(delete_bad_symbols)

    new_data = new_data.reset_index()

    return new_data

In [117]:
import random
import pandas as pd
from typing import Generator
from sklearn.model_selection import train_test_split


class Dataset(object):

    def __init__(self, data, seed=None, classes_description=None, *args, **kwargs):

        self.main_names = ['request', 'report']

        rs = random.getstate()
        random.seed(seed)
        self.random_state = random.getstate()
        random.setstate(rs)

        self.classes_description = classes_description
        self.data = dict()

        if data.get('train') is not None:
            self.data['train'] = data.get('train')
        elif data.get('test') is not None:
            self.data['test'] = data.get('test')
        elif data.get('valid') is not None:
            self.data['valid'] = data.get('valid')
        else:
            self.data['base'] = data

        self.classes = self.get_classes()
        self.classes_distribution = self.get_distribution()

    def simple_split(self, splitting_proportions, field_to_split, splitted_fields, delete_parent=True):
        data_to_div = self.data[field_to_split].copy()
        data_size = len(self.data[field_to_split])
        for i in range(len(splitted_fields) - 1):
            self.data[splitted_fields[i]], data_to_div = train_test_split(data_to_div,
                                                                          test_size=
                                                                          len(data_to_div) -
                                                                          int(data_size * splitting_proportions[i]))
        self.data[splitted_fields[-1]] = data_to_div

        if delete_parent:
            a = self.data.pop(field_to_split)
            del a

        return self

    def split(self, splitting_proportions=None, delete_parent=True):

        dd = dict()
        cd = self.classes_distribution
        train = list()
        valid = list()
        test = list()

        if splitting_proportions is None:
            splitting_proportions = [0.1, 0.1]

        if self.data.get('base', []) is not None:
            dataset = self.data['base']
        else:
            raise ValueError("You dataset don't contains 'base' key. If You want to split a specific part dataset,"
                             "please use .simple_split method.")

        for x, y in zip(dataset[self.main_names[0]], dataset[self.main_names[1]]):
            if y not in dd.keys():
                dd[y] = list()
                dd[y].append((x, y))
            else:
                dd[y].append((x, y))

        if type(splitting_proportions) is list:
            assert len(splitting_proportions) == 2
            assert type(splitting_proportions[0]) is float

            valid_ = dict()
            test_ = dict()

            for x in dd.keys():
                num = int(cd[x] * splitting_proportions[0])
                valid_[x] = random.sample(dd[x], num)
                [dd[x].remove(t) for t in valid_[x]]

            for x in dd.keys():
                num = int(cd[x] * splitting_proportions[1])
                test_[x] = random.sample(dd[x], num)
                [dd[x].remove(t) for t in test_[x]]
        else:
            raise ValueError('Split proportion must be list of floats, with length = 2')

        train_ = dd

        for x in train_.keys():
            for z_, z in zip([train_, valid_, test_], [train, valid, test]):
                z.extend(z_[x])

        del train_, valid_, test_, dd, cd, dataset

        for z in [train, valid, test]:
            z = random.shuffle(z)

        utrain, uvalid, utest, ctrain, cvalid, ctest = list(), list(), list(), list(), list(), list()
        for z, n, c in zip([train, valid, test], [utrain, uvalid, utest], [ctrain, cvalid, ctest]):
            for x in z:
                n.append(x[0])
                c.append(x[1])

        self.data['train'] = pd.DataFrame({self.main_names[0]: utrain, self.main_names[1]: ctrain})
        self.data['valid'] = pd.DataFrame({self.main_names[0]: uvalid, self.main_names[1]: cvalid})
        self.data['test'] = pd.DataFrame({self.main_names[0]: utest, self.main_names[1]: ctest})

        if delete_parent:
            a = self.data.pop('base', [])
            del a

        return self

    def iter_batch(self, batch_size: int, data_type: str = 'base') -> Generator:
        """This function returns a generator, which serves for generation of raw (no preprocessing such as tokenization)
         batches
        Args:
            batch_size (int): number of samples in batch
            data_type (str): can be either 'train', 'test', or 'valid'
        Returns:
            batch_gen (Generator): a generator, that iterates through the part (defined by data_type) of the dataset
        """
        data = self.data[data_type]
        data_len = len(data)
        order = list(range(data_len))

        rs = random.getstate()
        random.setstate(self.random_state)
        random.shuffle(order)
        self.random_state = random.getstate()
        random.setstate(rs)

        # for i in range((data_len - 1) // batch_size + 1):
        #     yield list(zip(*[data[o] for o in order[i * batch_size:(i + 1) * batch_size]]))
        for i in range((data_len - 1) // batch_size + 1):
            o = order[i * batch_size:(i + 1) * batch_size]
            yield list((list(data[self.main_names[0]][o]), list(data[self.main_names[1]][o])))

    def iter_all(self, data_type: str = 'base') -> Generator:
        """
        Iterate through all data. It can be used for building dictionary or
        Args:
            data_type (str): can be either 'train', 'test', or 'valid'
        Returns:
            samples_gen: a generator, that iterates through the all samples in the selected data type of the dataset
        """
        data = self.data[data_type]
        for x, y in zip(data[self.main_names[0]], data[self.main_names[1]]):
            yield (x, y)

    def merge_data(self, fields_to_merge, delete_parent=True, new_name=None):
        if new_name is None:
            new_name = '_'.join([s for s in fields_to_merge])

        if set(fields_to_merge) <= set(self.data.keys()):
            fraims_to_merge = [self.data[s] for s in fields_to_merge]
            self.data[new_name] = pd.concat(fraims_to_merge)
        else:
            raise KeyError('In dataset no such parts {}'.format(fields_to_merge))

        if delete_parent:
            a = [self.data.pop(x) for x in fields_to_merge]
            del a

        return self

    def del_data(self, fields_to_del):
        for name in fields_to_del:
            a = self.data.pop(name)
            del a
        return self

    def get_classes(self):
        if self.data.get('base') is not None:
            classes = self.data['base'][self.main_names[1]].unique()
        else:
            classes = self.data['train'][self.main_names[1]].unique()
        return classes

    def get_distribution(self):
        try:
            classes_distribution = self.data['base'].groupby(self.main_names[1])[self.main_names[0]].nunique()
        except KeyError:
            classes_distribution = self.data['train'].groupby(self.main_names[1])[self.main_names[0]].nunique()
        return classes_distribution

    def info(self):
        information = dict(data_keys=list(self.data.keys()),
                           classes_description=self.classes_description)

        return information

In [118]:
path = '/home/mks/projects/intent_classification_script/data/russian/data/vkusvill_all_categories.csv'
global_data = read_dataset(path)
dataset = Dataset(global_data, seed=42)
dataset = dataset.split()

  if self.run_code(code, result):


## Transformers

In [48]:
class BaseTransformer(object):
    def __init__(self):
        self.info = dict(type='transformer')

    def get_params(self):
        return self.params

    def set_params(self, params):
        # self.params = params
        self.__init__(params)
        return self


class Speller(BaseTransformer):
    def __init__(self, params=None):
        self.info = BaseTransformer().info
        
        if params is None:
            self.conf_path = '/home/mks/projects/intent_classification_script/DeepPavlov/deeppavlov/configs/error_model/brillmoore_kartaslov_ru.json'
        else:
            self.conf_path = params

        with open(self.conf_path) as config_file:
            self.config = json.load(config_file)

        self.speller = build_model_from_config(self.config)

    def transform(self, dataset, name='base'):
        
        print('[Speller start working ... ]')
        
        names = dataset.main_names
        data = dataset.data[name]

        refactor = list()
        for x in tqdm(data[names[0]]):
            refactor.append(self.speller([x])[0])

        dataset.data[name] = pd.DataFrame({names[0]: refactor,
                                           names[1]: data[names[1]]})
        
        print('[Speller done. ]')
        
        return dataset


class Tokenizer(BaseTransformer):
    def __init__(self, params=None):
        self.params = params
        self.info = BaseTransformer().info

    def transform(self, dataset, name='base'):
        
        print('[Starting tokenization ... ]')
        
        names = dataset.main_names
        data = dataset.data[name][names[0]]

        tok_data = list()
        for x in tqdm(data):
            sent_toks = nltk.sent_tokenize(x)
            word_toks = [nltk.word_tokenize(el) for el in sent_toks]
            tokens = [val for sublist in word_toks for val in sublist]
            tok_data.append(tokens)

        dataset.data[name] = pd.DataFrame({names[0]: tok_data,
                                           names[1]: dataset.data[name][names[1]]})
        
        print('[Tokenization was done. ]')
        
        return dataset


class Lemmatizer(BaseTransformer):
    def __init__(self, params=None):
        self.params = params
        self.morph = pymorphy2.MorphAnalyzer()
        self.info = BaseTransformer().info

    def transform(self, dataset, name='base'):
        print('[Starting lemmatization ... ]')
        names = dataset.main_names
        data = dataset.data[name][names[0]]

        morph_data = list()
        for x in tqdm(data):
            mp_data = [self.morph.parse(el)[0].normal_form for el in x]
            morph_data.append(mp_data)

        dataset.data[name] = pd.DataFrame({names[0]: morph_data,
                                           names[1]: dataset.data[name][names[1]]})
        print('[Ended lemmatization. ]')
        return dataset


class FasttextVectorizer(BaseTransformer):
    # TODO необходимо прописать логику того что векторизация может быть только после разделения датасета
    def __init__(self, params=None):
        if params is None:
            self.params = {'path_to_model': '/home/mks/projects/intent_classification_script/data/russian/embeddings/ft_0.8.3_nltk_yalen_sg_300.bin',
                           'dimension': 300,
                           'file_type': 'bin'}

        self.vectorizer = fasttext.load_model(self.params['path_to_model'])
        self.info = BaseTransformer().info

    def transform(self, dataset, name='base'):
        
        print('[Starting vectorization ... ]')
        
        names = dataset.main_names
        data = dataset.data[name][names[0]]

        vec_request = []
        for x in tqdm(data):
            matrix_i = np.zeros((len(x), self.params['dimension']))
            for j, y in enumerate(x):
                matrix_i[j] = self.vectorizer[y]
            vec_request.append(matrix_i)

        vec_report = list(labels2onehot_one(dataset.data[name][names[1]], dataset.classes))

        dataset.data[name] = pd.DataFrame({names[0]: vec_request,
                                           names[1]: vec_report})
        
        print('[Vectorization was end. ]')
        
        return dataset

## Sklearn wrapper

In [113]:
class skwrapper(object):
    def __init__(self, t, old_names=None, new_names=None):
        
        if (not hasattr(t, "fit" or not (hasattr(t, "fit_transform")) or hasattr(t, "transform"))):
            raise TypeError("Methods fit, fit_transform, transform are not implemented in class {} "
                            "Sklearn transformers and estimators shoud implement fit and transform."
                            " '%s' (type %s) doesn't" % (t, type(t)))
        
        self.transformer = t
        self.trained = False
        self.info = None
        
        if old_names is None:
            self.old_names = ['train', 'valid', 'test']
        else:
            self.old_names = old_names
        if new_names is None:
            self.new_names = self.old_names
        else:
            self.new_names = new_names


class sktransformer_wrapper(skwrapper):  
    def __init__(self, t, info, old_names=None, new_names=None):
        self.info = info
        super().__init__(t, old_names, new_names)
    
    def transform(self, dataset):
        request, report = dataset.main_names
        
        if hasattr(self.transformer, 'fit_transform'):
            if 'base' not in dataset.data.keys():
                dataset.merge_data(fields_to_merge=self.old_names, delete_parent=False, new_name='base')
            X = dataset.data['base'][request]
            y = dataset.data['base'][report]
            self.transformer.fit(X, y)
            self.trained = True
            
            # delete 'base' from dataset
            dataset.del_data(['base'])
            
            # transform all fields
            for name, new_name in zip(self.old_names, self.new_names):
                X = dataset.data[name][request]
                y = dataset.data[name][report]            
                dataset.data[new_name] = {report: self.transformer.transform(X),
                                          report: y}
                
        else:     
            for name, new_name in zip(self.old_names, self.new_names):
                X = dataset.data[name][request]
                y = dataset.data[name][report]            
                dataset.data[new_name] = {report: self.transformer.transform(X),
                                          report: y}
            
        return dataset

    
class skmodel_wrapper(skwrapper):
    def __init__(self, t, info, old_names=None, new_names=None):
        self.info = info
        super().__init__(t, old_names, new_names)
    
    def fit(self, dataset):
        self.old_names = ['train']
        request, report = dataset.main_names
        
        for name in self.old_names:
            X = dataset.data[name][request]
            y = dataset.data[name][report]

            if hasattr(self.transformer, 'fit') and not hasattr(self.transformer, 'fit_tranform'):
                self.transformer.fit(X, y)
                self.trained = True
        
        return self
    
    def predict(self, dataset):
        
        if not hasattr(self.transformer, 'predict'):
            raise TypeError("Methods predict, is not implemented in class {} "
                            " '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)))
        
        request, report = dataset.main_names
        
        if not self.trained:
            # TODO write correct error
            raise ValueError('Sklearn model is not trained yet.')
        
        for name, new_name in zip(self.old_names, self.new_names):
            X = dataset.data[name][request]
#             y = dataset.data[name][report]

            dataset.data[new_name] = self.transformer.predict(X)
        
        return dataset
    
    def fit_predict(self, dataset):      
        self.fit(dataset)
        dataset = self.predict(dataset)
        return dataset

In [120]:
info1 = {'op_type': 'vectorizer', 'name': 'tf-idf vectorizer'}
info2 = {'op_type': 'model', 'name': 'Linear Regression'}

clf = tfidf()
my_vec = sktransformer_wrapper(clf, info=info1)
lr = LogisticRegression()
my_lr = skmodel_wrapper(lr, info=info2)

dataset_ = my_vec.transform(dataset)
# dataset_ = my_lr.fit(dataset_, old_name='vec').predict(dataset_, old_name='vec', new_name='new')

dataset_.data['train']

TypeError: cannot concatenate object of type "<class 'dict'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid

In [127]:
dataset_.data.keys()

dict_keys(['train', 'valid', 'test'])

## Class tree

In [97]:
def initialization(name, config=None):
    names = set(['speller', 'lemmatizer', 'tokenizer', 'fasttext_vectorizer',
                 'count_vectorizer', 'tf-idf'])
    
    if name not in names:
        raise TypeError('{} is not implemented.'.format(name))
    
    if name == 'speller':
        return Speller(config)
    elif name == 'lemmatizer':
        return Lemmatizer(config)
    elif name == 'tokenizer':
        return Tokenizer(config)
    elif name == 'fasttext_vectorizer':
        return FasttextVectorizer(config)
    elif name == 'tf-idf':
        return skwrapper(TfidfVectorizer(config))
    elif name == 'count_vectorizer':
        return skwrapper(CountVectorizer(config))
    else:
        return None

In [5]:
a = initialization('speller')
a

2018-03-20 14:48:19.730 INFO in 'deeppavlov.vocabs.typos'['typos'] at line 76: Loading a dictionary from /home/mks/projects/intent_classification_script/DeepPavlov/download/russian_words_vocab
2018-03-20 14:48:25.510 INFO in 'deeppavlov.models.spellers.error_model.error_model'['error_model'] at line 239: loading error_model from `/home/mks/projects/intent_classification_script/DeepPavlov/download/error_model/error_model_ru.tsv`


## Pipeline

In [64]:
class Pipeline(object):
    def __init__(self, pipe):
        self.pipe = pipe
    
    def fit(self, dataset, name, **fit_params):

        for op in self.pipe:
            operation = op[1]
            if operation is not None:
                if operation.info['type'] == 'transformer':
                    dataset = operation.transform(dataset, name=name)
                elif operation.info['type'] == 'model':
                    operation.init(dataset)
                    operation.fit()
            else:
                pass

        return self, dataset

    def predict(self, dataset, name, **fit_params):
        prediction = None

        for op in self.pipe:
            operation = op[1]
            if operation is not None:
                if operation.info['type'] == 'transformer':
                    dataset = operation.transform(dataset, name=name)
                elif operation.info['type'] == 'model':
                    operation.init(dataset)
                    prediction = operation.predict(dataset)
            else:
                pass

        return prediction

In [None]:
pipe1 = [('Speller', Speller()), ('Tokenizer', Tokenizer())]
pipe2 = [('Lemmatizer', Lemmatizer()), ('Vectorizer', FasttextVectorizer())]
pipe3 = [('Tokenizer', Tokenizer()), ('Lemmatizer', Lemmatizer()), ('Vectorizer', FasttextVectorizer())]

In [50]:
pipeline = Pipeline(pipe3)

In [58]:
pipeline, dataset_ = pipeline.fit(dataset, 'test')

 10%|▉         | 442/4475 [00:00<00:00, 4416.78it/s]

[Starting tokenization ... ]


100%|██████████| 4475/4475 [00:00<00:00, 5160.24it/s]
  0%|          | 11/4475 [00:00<00:40, 109.77it/s]

[Tokenization was done. ]
[Starting lemmatization ... ]


100%|██████████| 4475/4475 [00:26<00:00, 166.19it/s]
  4%|▎         | 162/4475 [00:00<00:02, 1606.20it/s]

[Ended lemmatization. ]
[Starting vectorization ... ]


100%|██████████| 4475/4475 [00:02<00:00, 1734.46it/s]

[Vectorization was end. ]





In [59]:
dataset_.data['test'].head()

Unnamed: 0,report,request
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[[-0.18812957406044006, -0.0639662817120552, -..."
1,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.12598323822021484, 0.1466168761253357, -0...."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.5365741848945618, 0.01435030810534954, 0...."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[[-0.2726789116859436, 0.21540087461471558, -0..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.06901668012142181, 0.14644865691661835, -0..."


## Models