<a href="https://colab.research.google.com/github/iamzager/Karelia_churn/blob/cleaning/Kareila_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
TOKEN = ''
URL = f'https://iamzager:{TOKEN}@github.com/iamzager/Karelia_churn.git'

In [2]:
!git init
!git pull $URL cleaning
# !git config --global user.email ''
!git config --global user.name 'iamzager'
!git remote add origin $URL

Initialized empty Git repository in /content/.git/
remote: Enumerating objects: 280, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 280 (delta 45), reused 77 (delta 39), pack-reused 182[K
Receiving objects: 100% (280/280), 113.15 MiB | 21.27 MiB/s, done.
Resolving deltas: 100% (102/102), done.
From https://github.com/iamzager/Karelia_churn
 * branch            cleaning   -> FETCH_HEAD


In [3]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.2-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 21.1 MB/s 
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 63.1 MB/s 
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 10.5 MB/s 
Collecting Mako
  Downloading Mako-1.2.3-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 7.6 MB/s 
Collecting stevedore>=2.0.1
  Downloading stevedore-3.5.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 6.1 MB/s 
[?25hCollecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl (1

In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import optuna

from lightgbm import LGBMClassifier

import json
import gc
import re

In [5]:
RANDOM_STATE = 17
FEATURES_FILE_NAME = 'X_for_tuning.csv'
BEST_PARAMS_FILE_NAME = 'best_params.json'
META_BEST_PARAMS_FILE_NAME = 'meta_best_params.json'

In [6]:
def scoring(estimator, X, y_true):
  preds = estimator.predict(X)
  return recall_score(y_true, preds, average='macro')
def validate(model, X, y, cv, random_state=RANDOM_STATE):    
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        n_jobs=-1,
        scoring=scoring,
        return_train_score=True
        )
    print('test: ', cv_results['test_score'].mean(), cv_results['test_score'].std())
    print('train: ', cv_results['train_score'].mean(), cv_results['train_score'].std())
    print('diff: ',
        (cv_results['train_score'] - cv_results['test_score']).mean(),
        (cv_results['train_score'] - cv_results['test_score']).std()
        )
    return cv_results['test_score']
def clean(X, estimator):    
    mask = (estimator.fit_predict(X) == 1)
    print(f'{round(1 - (mask.sum() / mask.shape[0]), 2)}%, {mask.shape[0] - mask.sum()} штук выбросов')
    return mask

In [7]:
# !git pull origin 
X = pd.read_csv(FEATURES_FILE_NAME).set_index('contract_id')
categorical_features = np.r_[
    X.columns.values[X.columns.str.contains('has')],\
    ['day_or_month_contract']
]
other_features = set(X.columns) - set(categorical_features)
X[categorical_features] = X[categorical_features].astype('category')

X_train, y_train = X.drop('blocked', axis=1), X['blocked']
del X
gc.collect()

0

In [8]:
log = pd.read_csv('support_log.csv', parse_dates=['event_date'])
competitors = pd.read_csv('dns_log.csv', parse_dates=['date'], index_col=0).reset_index(drop=True)
competitors['date'] = competitors['date'].dt.date
events = pd.concat(
    [
        competitors[['date', 'url', 'contract_id']],
        log[['event_date', 'event_type', 'contract_id']].rename({'event_date':'date'},axis=1)
    ], axis=0
    )
events['event'] =  events['event_type'].fillna(events['url'])
events = events.drop(['url', 'event_type'], axis=1).sort_values(by=['contract_id', 'date'])
events_corpus = events.groupby('contract_id')['event']\
    .apply(lambda x : '__'.join(x))\
    .reindex(X_train.index)\
    .fillna('Nothing')
del events, log, competitors
gc.collect()

11

- Сначала подбираются параметры для мета-алгоритма (SGDClassifier в файле Karelia_main) одновременно с соответствующим TfidfVectorizer
- С использованием полученных параметров META_BEST_PARAMS_FILE_NAME рассчитываются основные признаки FEATURES_FILE_NAME
- Затем подбираются параметры основного алгоритма BEST_PARAMS_FILE_NAME одновременно с алгоритмом отбора выбросов

# meta

In [9]:
def optimize(objective_func, study_name, n_trials, study=None):
    print('start optimize')
    if study is None:
        study = optuna.create_study(direction='maximize', study_name=study_name,\
						            sampler=optuna.samplers.TPESampler())
    study.optimize(objective_func, n_trials=n_trials, show_progress_bar=True, n_jobs=1)
    print('done optimize')
    return study

In [10]:
def sgd_vec(trial):
    vec_params = {
        'tokenizer' : lambda s: s.split('__'),
        'ngram_range' : (1, trial.suggest_int('ngram_max', 1, 3)),
        'max_features' : trial.suggest_categorical(
            'max_features',
            [None] + list(range(100, 1000, 100)) + list(range(1500, 5000, 500))
            )
    }
    sgd_params = {
        'loss' : 'log',
        'random_state' : RANDOM_STATE,
        'class_weight' : 'balanced',
        'alpha' : trial.suggest_float('alpha', 1e-5, 1e-1),
        'tol' : trial.suggest_float('tol', 1e-4, 1e-1),
        'average' : trial.suggest_categorical('average', [True, False])
    }

    vec = TfidfVectorizer(**vec_params)
    events_vec = vec.fit_transform(events_corpus)
    sgd = SGDClassifier(**sgd_params)

    score = cross_val_score(sgd, X_train, y_train, n_jobs=-1,\
                            cv=cv_split, scoring=scoring)    
    return np.mean(score)

In [11]:
cv_split = StratifiedKFold(10, shuffle=True, random_state=RANDOM_STATE)

In [12]:
# На практике необходимо примерно 200 итераций
n_iterations = 10
%time sgd_result = optimize(sgd_vec, 'sgd_vec', n_iterations)

[32m[I 2022-10-08 19:37:23,537][0m A new study created in memory with name: sgd_vec[0m


start optimize


  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2022-10-08 19:37:25,469][0m Trial 0 finished with value: 0.6350998595393502 and parameters: {'ngram_max': 3, 'max_features': 3000, 'alpha': 0.05912455613871675, 'tol': 0.012298611068583293, 'average': True}. Best is trial 0 with value: 0.6350998595393502.[0m
[32m[I 2022-10-08 19:37:26,085][0m Trial 1 finished with value: 0.6331467516563996 and parameters: {'ngram_max': 3, 'max_features': 600, 'alpha': 0.0901504919027494, 'tol': 0.007992277947989415, 'average': True}. Best is trial 0 with value: 0.6350998595393502.[0m
[32m[I 2022-10-08 19:37:26,497][0m Trial 2 finished with value: 0.6248462151204957 and parameters: {'ngram_max': 2, 'max_features': 3000, 'alpha': 0.0381446928461468, 'tol': 0.0633628624595961, 'average': False}. Best is trial 0 with value: 0.6350998595393502.[0m
[32m[I 2022-10-08 19:37:27,103][0m Trial 3 finished with value: 0.62172837933547 and parameters: {'ngram_max': 2, 'max_features': None, 'alpha': 0.09872468866460105, 'tol': 0.09066314757416101, '

In [13]:
temp = dict(sgd_result.best_params)
meta_best_params = {
    'vec_params' : {'ngram_range':(1, temp.pop('ngram_max')), 'max_features':temp.pop('max_features')},
    'sgd_params' : temp
}

In [14]:
# with open(META_BEST_PARAMS_FILE_NAME, 'w') as f:
#     json.dump(meta_best_params, f)
# !git add $META_BEST_PARAMS_FILE_NAME
# !git commit -m 'updated tuned params'
# !git push origin 

# main

In [15]:
def lgb_regularization(trial):
    lgb_params = {
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'max_depth' : trial.suggest_int('max_depth', 1, 15),
        'num_leaves' : trial.suggest_int('num_leaves', 20, 200, step=1),
        'min_child_samples' : trial.suggest_int('min_child_samples', 9, 200, step=1),
        'subsample' : trial.suggest_float("subsample", 0.4, 1.0, log=False),
        'reg_alpha' : trial.suggest_float("reg_alpha", 0.0, 15., log=False),
        'reg_lambda' : trial.suggest_float("reg_lambda", 0.0, 20., log=False),
        'colsample_bytree' : trial.suggest_float("colsample_bytree", 0.4, 1.0, log=False)
    }

    lof_params = {
        'contamination' : trial.suggest_float("contamination", 0, 0.15, log=False),
        'n_neighbors' : trial.suggest_int('n_neighbors', 5, 60)
    }
    lof = LocalOutlierFactor(
        n_jobs=1, **lof_params           
        )  
    mask = (lof.fit_predict(X_train) == 1)
    lgb = LGBMClassifier(
        objective='binary', class_weight='balanced',\
        random_state=RANDOM_STATE,\
        n_jobs=-1, importance_type='gain', **lgb_params
    )
    score = cross_val_score(lgb, X_train.loc[mask, :], y_train.loc[mask], n_jobs=1,\
                            cv=cv_split, scoring=scoring)
    
    return np.mean(score)

def lgb_learning(trial):
    lgb_params = {
        'n_estimators' : trial.suggest_int('n_estimators', 40, 300, step=1),
        'learning_rate' : trial.suggest_float('learning_rate', 0.001, 0.8)
    }

    lgb = LGBMClassifier(
        objective='binary', class_weight='balanced',\
        random_state=RANDOM_STATE,\
        n_jobs=-1, importance_type='gain', **lgb_best_params, **lgb_params\
        )
    score = cross_val_score(lgb, X_train.loc[best_mask, :], y_train.loc[best_mask], n_jobs=1,\
                            cv=cv_split, scoring=scoring)
    return np.mean(score)

In [16]:
# На практике необходимо примерно 200 итераций
n_iterations = 5
%time result = optimize(lgb_regularization, 'lgb_regularization', n_iterations)

[32m[I 2022-10-08 19:37:35,432][0m A new study created in memory with name: lgb_regularization[0m


start optimize


  self._init_valid()


  0%|          | 0/5 [00:00<?, ?it/s]

[32m[I 2022-10-08 19:37:40,849][0m Trial 0 finished with value: 0.716819865563225 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'num_leaves': 90, 'min_child_samples': 134, 'subsample': 0.8782683701508827, 'reg_alpha': 5.897579620308839, 'reg_lambda': 6.4394409231393475, 'colsample_bytree': 0.7439428257065328, 'contamination': 0.12003878342348835, 'n_neighbors': 59}. Best is trial 0 with value: 0.716819865563225.[0m
[32m[I 2022-10-08 19:37:43,043][0m Trial 1 finished with value: 0.7281024102105592 and parameters: {'boosting_type': 'gbdt', 'max_depth': 6, 'num_leaves': 46, 'min_child_samples': 188, 'subsample': 0.44678581970114645, 'reg_alpha': 1.903594682628128, 'reg_lambda': 2.6771650303341032, 'colsample_bytree': 0.9868076294815188, 'contamination': 0.06990564054953116, 'n_neighbors': 49}. Best is trial 1 with value: 0.7281024102105592.[0m
[32m[I 2022-10-08 19:37:45,717][0m Trial 2 finished with value: 0.7353559508415255 and parameters: {'boosting_type': 'dart', '

In [17]:
temp = dict(result.best_params)
lof_best_params = {
    'contamination': temp.pop('contamination'),
    'n_neighbors': temp.pop('n_neighbors')
}
best_lof = LocalOutlierFactor(
    n_jobs=1, **lof_best_params           
    )
best_mask = (best_lof.fit_predict(X_train) == 1)
lgb_best_params = temp

In [18]:
# На практике необходимо примерно 100 итераций
n_iterations = 5
%time result_learning = optimize(lgb_learning, 'learning_params', n_iterations)

[32m[I 2022-10-08 19:37:50,745][0m A new study created in memory with name: learning_params[0m


start optimize


  self._init_valid()


  0%|          | 0/5 [00:00<?, ?it/s]

[32m[I 2022-10-08 19:37:52,187][0m Trial 0 finished with value: 0.7270788939592893 and parameters: {'n_estimators': 64, 'learning_rate': 0.453125257123225}. Best is trial 0 with value: 0.7270788939592893.[0m
[32m[I 2022-10-08 19:37:53,505][0m Trial 1 finished with value: 0.7279643212875266 and parameters: {'n_estimators': 70, 'learning_rate': 0.012267421227043666}. Best is trial 1 with value: 0.7279643212875266.[0m
[32m[I 2022-10-08 19:37:59,358][0m Trial 2 finished with value: 0.7300684496245463 and parameters: {'n_estimators': 215, 'learning_rate': 0.17782795627338305}. Best is trial 2 with value: 0.7300684496245463.[0m
[32m[I 2022-10-08 19:38:05,015][0m Trial 3 finished with value: 0.7238213126463495 and parameters: {'n_estimators': 212, 'learning_rate': 0.46738303178156004}. Best is trial 2 with value: 0.7300684496245463.[0m
[32m[I 2022-10-08 19:38:10,470][0m Trial 4 finished with value: 0.718901480377301 and parameters: {'n_estimators': 212, 'learning_rate': 0.660005

In [19]:
lgb_best_params.update(result_learning.best_params)

In [20]:
best_model = LGBMClassifier(
    objective='binary', class_weight='balanced',\
    random_state=17,\
    n_jobs=-1, importance_type='gain', **lgb_best_params
)
cv_split = StratifiedKFold(10, shuffle=True, random_state=RANDOM_STATE)
best_lof = LocalOutlierFactor(**lof_best_params, n_jobs=-1)
best_mask = clean(X_train, best_lof)
cv_results = validate(best_model, X_train.loc[best_mask, :], y_train.loc[best_mask], cv_split, RANDOM_STATE)

0.06%, 382 штук выбросов
test:  0.7300684496245463 0.03908629799163949
train:  0.8028045914472616 0.002834211581579805
diff:  0.07273614182271516 0.040870152366999195


In [21]:
best_params = {
    'lof_params' : lof_best_params,
    'lgb_params' : lgb_best_params
}

# with open(BEST_PARAMS_FILE_NAME, 'w') as f:
#     json.dump(best_params, f)
# !git add $BEST_PARAMS_FILE_NAME
# !git commit -m 'updated tuned params'
# !git push origin

# Визуализация подбора

In [22]:
optuna.visualization.plot_optimization_history(result)

In [23]:
optuna.visualization.plot_parallel_coordinate(result)

In [24]:
optuna.visualization.plot_slice(result)