In [1]:
from sklearn.base import TransformerMixin, BaseEstimator


class LogTransformerr(TransformerMixin, BaseEstimator):
    """
    Log transforming
    """

    def fit(self, X):
        return self

    def transform(self, X):
        return np.log(X + 1)

In [None]:
from typing import Optional, Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES

ArrayLike = Union[np.ndarray, pd.DataFrame]


class CustomLogTransformer(BaseEstimator, TransformerMixin):
    """
    Stupid log transformer to proof a concept
    """

    def __init__(self, copy: bool = True) -> None:
        self.copy = copy

    def fit(
        self, X: ArrayLike, y: Optional[np.ndarray] = None
    ) -> "CustomLogTransformer":
        return self

    def transform(
        self, X: ArrayLike, y: Optional[np.ndarray] = None, copy: Optional[bool] = None
    ):
        copy = copy or self.copy
        X_new = self._validate_data(
            X,
            reset=False,
            estimator=self,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
            copy=copy,
        )
        X_new -= np.min(X_new, axis=0, keepdims=True)
        X_new = np.log1p(X_new)
        return X_new

    def fit_transform(self, X: ArrayLike, y: Optional[np.ndarray] = None) -> ArrayLike:
        return self.fit(X, y).transform(X, y)

# Imports

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from yaml import load as yaml_load, FullLoader
import pickle
import sklearn
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, LinearRegression, PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, \
    StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report, \
    precision_recall_curve, precision_recall_fscore_support, plot_precision_recall_curve, plot_roc_curve, \
    precision_score, recall_score, make_scorer

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
# import os, sys
# path = os.path.abspath(os.path.join('../transformer/'))
# sys.path.append(path)

In [5]:
from functools import partial
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll import scope

import ntpath


# random state
RS = 42

## Preprocessing define

In [6]:
df = pd.read_csv('data/raw/heart_cleveland_upload.csv')
df_train, df_oos = train_test_split(df, stratify=df['condition'], test_size=0.25, random_state=42)
df.shape, df_train.shape, df_oos.shape

((297, 14), (222, 14), (75, 14))

In [7]:
df_train.to_csv('data/raw/train.csv', index=False)
df_oos.to_csv('data/raw/oos.csv', index=False)

In [8]:
X_train, y_train = df_train.drop(columns=['condition']), df_train['condition']

## Hyper parameters search

In [44]:
def build_model(clf_name, classifiers, r_state):
    """
    Формируем модель с указанными гиперпараметрами 
    """ 
    
    if clf_name == 'logreg':
        classifier = LogisticRegression(class_weight='balanced',
                                        solver='saga',
                                        n_jobs=-1,
                                        random_state=r_state)
        
    elif clf_name == 'naive_bayes':
        classifier = MultinomialNB()
        
    else: 
        raise KeyError('Unknown classifier: {}'.format(clf_name))
        
    
    search_space = classifiers[clf_name]
    
    model = Pipeline([
                            ('log_scaler', LogTransformerr()),
                            ('clf', classifier)
    ])
    
    return model, search_space

In [45]:
param_range_fl = [1.0, 0.5]

logreg_params = [{'clf__penalty': ['l1', 'l2'],
                'clf__C': param_range_fl,
                'clf__solver': ['liblinear']}] 


In [46]:
naive_bayes_params = {
                    'clf__alpha': hp.loguniform('alpha', -4*np.log(10), 2*np.log(10)),
                    'clf__fit_prior': [True, False]
}

In [47]:
classifiers = {'logreg': logreg_params, 
               'naive_bayes': naive_bayes_params}

In [48]:
classifiers

{'logreg': [{'clf__penalty': ['l1', 'l2'],
   'clf__C': [1.0, 0.5],
   'clf__solver': ['liblinear']}],
 'naive_bayes': {'clf__alpha': <hyperopt.pyll.base.Apply at 0x1caf905daf0>,
  'clf__fit_prior': [True, False]}}

## logreg

In [51]:
SEED=42
RS=42

In [52]:
model, search_space = build_model('logreg', classifiers, r_state=RS)

In [54]:
model

Pipeline(steps=[('log_scaler', LogTransformerr()),
                ('clf',
                 LogisticRegression(class_weight='balanced', n_jobs=-1,
                                    random_state=42, solver='saga'))])

In [55]:
LR = GridSearchCV(estimator=model,
            param_grid=search_space,
            scoring='accuracy',
            cv=10) 

In [58]:
LR

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('log_scaler', LogTransformerr()),
                                       ('clf',
                                        LogisticRegression(class_weight='balanced',
                                                           n_jobs=-1,
                                                           random_state=42,
                                                           solver='saga'))]),
             param_grid=[{'clf__C': [1.0, 0.5], 'clf__penalty': ['l1', 'l2'],
                          'clf__solver': ['liblinear']}],
             scoring='accuracy')

In [57]:
# print('\nEstimator: %s' % grid_dict[idx])
LR.fit(X_train, y_train)
print('Best params are : %s' % LR.best_params_)
# Best training data accuracy
print('Best training accuracy: %.3f' % LR.best_score_)
# Predict on test data with best params
y_pred = LR.predict(x_test)
# Test data accuracy of model with best params
print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
# Track best (highest test accuracy) model

Traceback (most recent call last):
  File "C:\Users\Irina\AppData\Local\Continuum\anaconda3\envs\news\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Irina\AppData\Local\Continuum\anaconda3\envs\news\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Irina\AppData\Local\Continuum\anaconda3\envs\news\lib\site-packages\sklearn\pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\Irina\AppData\Local\Continuum\anaconda3\envs\news\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Irina\AppData\Local\Continuum\anaconda3\envs\news\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Irina\AppData\Local\Continuum\anacond

TypeError: fit() takes 2 positional arguments but 3 were given

In [30]:
best

NameError: name 'best' is not defined

In [27]:
min(trials.losses())

0.4097697337648637

In [28]:
i = trials.losses().index(min(trials.losses()))
params = trials.results[i]['params']
params

{'clf__C': 40.756082062867726,
 'clf__max_iter': 750,
 'clf__penalty': 'l2',
 'digit_eraser__enabled': False,
 'punct_eraser__enabled': False,
 'vec__analyzer': 'char_wb',
 'vec__binary': False,
 'vec__max_df': 0.7136875223969704,
 'vec__min_df': 0.003366616013693943,
 'vec__ngram_range': (3, 5),
 'vec__norm': 'l2',
 'word_pipeline__word_additional_stopwords__enabled': True,
 'word_pipeline__word_normalizer__enabled': False,
 'word_pipeline__word_oneletter_stopwords__enabled': True,
 'word_pipeline__word_standard_stopwords__enabled': False,
 'word_pipeline__word_synonyms__enabled': False}

In [29]:
model.set_params(**params)

SwitchPipeline(enabled_steps={'clf': True, 'digit_eraser': False,
                              'punct_eraser': False, 'vec': True,
                              'word_pipeline': True},
               steps=[('digit_eraser', DigitEraser()),
                      ('punct_eraser', PunctuationEraser()),
                      ('word_pipeline',
                       SwitchPipeline(enabled_steps={'word_additional_stopwords': True,
                                                     'word_joiner': True,
                                                     'word_normalizer': False,
                                                     'word_oneletter_stopwords': True,
                                                     'wor...
                                                                     'гуманный': ['чело',
                                                                                  'человечески',
                                                                                  

In [30]:
model.fit(X_train, y_train)

SwitchPipeline(enabled_steps={'clf': True, 'digit_eraser': False,
                              'punct_eraser': False, 'vec': True,
                              'word_pipeline': True},
               steps=[('digit_eraser', DigitEraser()),
                      ('punct_eraser', PunctuationEraser()),
                      ('word_pipeline',
                       SwitchPipeline(enabled_steps={'word_additional_stopwords': True,
                                                     'word_joiner': True,
                                                     'word_normalizer': False,
                                                     'word_oneletter_stopwords': True,
                                                     'wor...
                                                                     'гуманный': ['чело',
                                                                                  'человечески',
                                                                                  

In [32]:
preds_train = model.predict(X_train)
preds_oos = model.predict(X_oos)
# preds_oot = model.predict(X_oot)

In [33]:
train_f1_macro = f1_score(y_train, preds_train, average='macro')
oos_f1_macro = f1_score(y_oos, preds_oos, average='macro')
# oot_f1_macro = f1_score(y_oot, preds_oot, average='macro')

In [34]:
print('Train F1 macro=', train_f1_macro)
# print('Cross Validation F1 macro=', model.best_score_, 'std=', model.cv_results_['std_test_score'][model.best_index_])
print('Out of sample F1 macro=', oos_f1_macro)
# print('Out of time F1 macro=', oot_f1_macro)

Train F1 macro= 0.9915278849115484
Out of sample F1 macro= 0.8759892191854124


In [36]:
with open('./model/' + path_leaf(path_train)[:-5] + '_model.pkl', 'wb') as f:
    pickle.dump(model, f)



# Out of sample classification report

In [37]:
print(classification_report(y_oos, preds_oos))

                         precision    recall  f1-score   support

     DFLM_NLP_AKTIVFPSU       0.92      0.95      0.94        64
        DFLM_NLP_INKASS       0.91      0.95      0.93        76
  DFLM_NLP_INKASSFORSLM       0.96      0.92      0.94        49
          DFLM_NLP_MCTP       0.85      0.87      0.86        45
            DFLM_NLP_OK       0.90      0.88      0.89       178
         DFLM_NLP_OTHER       0.81      0.72      0.76       158
     DFLM_NLP_PROVAIDER       0.84      0.90      0.87        48
DFLM_NLP_READYFORUPLOAD       0.90      0.98      0.94        64
        DFLM_NLP_SBOYPO       0.76      0.86      0.81        44
           DFLM_NLP_SLM       0.81      0.85      0.83        54

               accuracy                           0.87       780
              macro avg       0.87      0.89      0.88       780
           weighted avg       0.87      0.87      0.87       780





## naive_bayes

In [31]:
naive_bayes, naive_bayes_search_space = build_model('naive_bayes', classifiers, common_space, r_state=RS)

NameError: name 'MultinomialNB' is not defined

In [43]:
RS=1
# запускаем hyperopt
trials = Trials()
best = fmin( 
          # функция для оптимизации  
            fn=partial(objective, pipeline=naive_bayes, X_train=X_train, y_train=y_train, r_state=RS),
          # пространство поиска гиперпараметров  
            space=naive_bayes_search_space,
          # алгоритм поиска
            algo=tpe.suggest,
          # число итераций 
          # (можно ещё указать и время поиска) 
            max_evals=30,
          # куда сохранять историю поиска
            trials=trials,
          # random state
            rstate=np.random.RandomState(RS),
          # progressbar
            show_progressbar=True
          )

100%|██████████████████████████████████████████████████| 30/30 [02:17<00:00,  4.60s/trial, best loss: 1.078981484321384]


In [44]:
i = trials.losses().index(min(trials.losses()))
naive_bayes_params = trials.results[i]['params']

In [45]:
naive_bayes.set_params(**naive_bayes_params)
naive_bayes.fit(X_train, y_train)

SwitchPipeline(enabled_steps={'clf': True, 'digit_eraser': False,
                              'punct_eraser': False, 'vec': True,
                              'word_pipeline': True},
               steps=[('digit_eraser', DigitEraser()),
                      ('punct_eraser', PunctuationEraser()),
                      ('word_pipeline',
                       SwitchPipeline(enabled_steps={'word_additional_stopwords': False,
                                                     'word_joiner': True,
                                                     'word_normalizer': False,
                                                     'word_oneletter_stopwords': True,
                                                     'wo...
                                                                                'графика',
                                                                                'дежурство',
                                                                                'дежу

In [46]:
train_f1_macro = f1_score(y_train, naive_bayes.predict(X_train), average='macro')
oos_f1_macro = f1_score(y_oos, naive_bayes.predict(X_oos), average='macro')
print(train_f1_macro, oos_f1_macro)

0.7061534069762041 0.6784736659799117
