<a href="https://colab.research.google.com/github/iamzager/Karelia_churn/blob/cleaning/Karelia_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Never paste a personal access token into this cell: the notebook is committed
# to the repository and the token would be published with it. Export
# GITHUB_TOKEN before starting the kernel instead. When the variable is not set
# the token stays empty, which is what the notebook used before.
TOKEN = os.environ.get('GITHUB_TOKEN', '')
URL = f'https://iamzager:{TOKEN}@github.com/iamzager/Karelia_churn.git'

In [None]:
# Turn the working directory into a git repository and pull the project files
# from the authenticated URL built in the previous cell.
!git init
!git pull $URL
# !git config --global user.email ''
!git config --global user.name 'iamzager'
# Register the same URL as `origin` so that later push/pull commands can use it.
!git remote add origin $URL

Initialized empty Git repository in /content/.git/
remote: Enumerating objects: 268, done.[K
remote: Counting objects: 100% (268/268), done.[K
remote: Compressing objects: 100% (202/202), done.[K
remote: Total 268 (delta 95), reused 200 (delta 64), pack-reused 0[K
Receiving objects: 100% (268/268), 113.14 MiB | 20.74 MiB/s, done.
Resolving deltas: 100% (95/95), done.
From https://github.com/iamzager/Karelia_churn
 * branch            cleaning   -> FETCH_HEAD


In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from lightgbm import LGBMClassifier

import gc
import json

In [None]:
# Labelled contracts; the index is the contract id used to join every feature table.
train_df = pd.read_csv('train.csv').set_index('contract_id')
# Support log with parsed event dates.
log = pd.read_csv('support_log.csv', parse_dates=['event_date'])
# Contract types, one row per contract (later duplicates are discarded).
type_contract = pd.read_csv('type_contract.csv').drop_duplicates('contract_id')
# Submission template; the target column is blanked out.
sample = pd.read_csv('sample_solution.csv').assign(blocked=np.nan)
# DNS log: the stored index column is thrown away and timestamps are reduced to calendar dates.
competitors = (
    pd.read_csv('dns_log.csv', parse_dates=['date'], index_col=0)
    .reset_index(drop=True)
    .assign(date=lambda frame: frame['date'].dt.date)
)
# One combined event stream: DNS requests and support requests stacked on a shared `date` column.
events = pd.concat(
    [
        competitors[['date', 'url', 'contract_id']],
        log[['event_date', 'event_type', 'contract_id']].rename(columns={'event_date': 'date'}),
    ],
    axis=0,
)
# The event label is the support event type when present, otherwise the requested url.
events['event'] = events['event_type'].fillna(events['url'])
events = (
    events
    .drop(columns=['url', 'event_type'])
    .sort_values(by=['contract_id', 'date'])
)

In [None]:
# Seed passed to the models, the CV splitter and the fold permutation below.
RANDOM_STATE = 17
# CSV file the final predictions are written to.
PREDS_FILE_NAME = 'final_predictions.csv'
# JSON files holding the tuned hyper-parameters (read a few cells below).
BEST_PARAMS_FILE_NAME = 'best_params.json'
META_BEST_PARAMS_FILE_NAME = 'meta_best_params.json'

- Гиперпараметры подбираются в файле Karelia_tuning

In [None]:
# !git pull origin
# Read the tuned hyper-parameters. The encoding is given explicitly so that the
# JSON files are decoded the same way regardless of the platform's locale.
with open(BEST_PARAMS_FILE_NAME, encoding='utf-8') as f:
    best_params = json.load(f)
with open(META_BEST_PARAMS_FILE_NAME, encoding='utf-8') as f:
    meta_best_params = json.load(f)

In [None]:
# Tuned keyword arguments for the final LGBMClassifier and for LocalOutlierFactor.
MODEL_BEST_PARAMS = best_params['lgb_params']
LOF_BEST_PARAMS = best_params['lof_params']

# Tuned keyword arguments for the stacking step (TfidfVectorizer and SGDClassifier).
META_VEC_PARAMS = meta_best_params['vec_params']
META_ESTIMATOR_PARAMS = meta_best_params['sgd_params']

# Fixed TfidfVectorizer settings for the PCA features: a document is a chain of
# events joined by '__', so tokens are obtained by splitting on that separator;
# uni- and bi-grams are kept and the vocabulary size is not limited.
PCA_VEC_PARAMS = {
    'ngram_range' : (1,2),
    'tokenizer' : lambda s: s.split('__'),
    'max_features' : None
}        
# Fixed PCA settings; the number of components is supplied by the caller.
PCA_PARAMS = {
    'svd_solver' : 'randomized',
    'random_state' : RANDOM_STATE
}

- Помимо признаков, выделенных при разведочном анализе, действия пользователей (events) кодируются с помощью Tf-idf
- Данные отсортированы так, чтобы n-граммы представляли собой n действий, выполненных **подряд** в определенном порядке. Таким образом, изучаются характерные цепочки действий, паттерны в поведении пользователей
- Векторизованные действия представлены в итоговых признаках двумя способами:
    - Сжимаются PCA. Первые 12 компонент используются как признаки pca_i
    - Передаются логистической регрессии (SGDClassifier(loss='log')). <br> 
        В качестве признака **meta** используется прогноз вероятности целевого класса. <br>
        Для избежания переобучения реализован классический стекинг с делением на 30 фолдов
- Выбросы убираются с помощью LocalOutlierFactor

In [None]:
def clean(X, estimator):
    """Flag inliers with an outlier detector and report how many rows were rejected.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``estimator.fit_predict``.
    estimator : object
        Detector whose ``fit_predict`` labels inliers with 1
        (for example ``LocalOutlierFactor``).

    Returns
    -------
    numpy.ndarray of bool
        True for rows labelled as inliers, i.e. rows to keep.
    """
    mask = (estimator.fit_predict(X) == 1)
    n_rows = mask.shape[0]
    n_outliers = int(n_rows - mask.sum())
    outlier_share = n_outliers / n_rows if n_rows else 0.0
    # Format the share as a real percentage: a bare fraction followed by '%'
    # would understate the share of outliers a hundredfold.
    print(f'{outlier_share:.2%}, {n_outliers} штук выбросов')
    return mask

def add_payment_features(X):
    """Attach the contract type to every contract in ``X``.

    ``X`` is left-joined with the module-level ``type_contract`` table on
    ``contract_id`` and re-indexed by that id. Contracts without a record get
    ``day_or_month_contract`` equal to 0; the column is stored as a category.
    """
    merged = X.merge(type_contract, how='left', on='contract_id')
    merged = merged.set_index('contract_id')
    contract_kind = merged['day_or_month_contract'].fillna(0)
    merged['day_or_month_contract'] = contract_kind.astype('category')
    return merged

def add_url_features(X, competitors):
    """Add features derived from the DNS log of requests to competitor sites.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame whose index is ``contract_id`` (merges are done on that
        name and ``X.index`` is used as the list of contracts).
    competitors : pandas.DataFrame
        DNS log; the columns read here are ``contract_id``, ``date``, ``url``,
        ``rt`` and ``sampo``.
        NOTE(review): ``rt`` / ``sampo`` are presumably per-row counts or flags
        for the two competitors -- confirm against the data description.

    Returns
    -------
    pandas.DataFrame
        ``X`` with the added columns ``rt_log``, ``n_urls_log``,
        ``n_urls_1_log``, ``has_urls``, ``has_many_urls``,
        ``has_both_competitors``, ``has_urls_1``, ``days_from_last_url_log``,
        ``days_from_first_url``, ``has_top_urls`` and ``has_bottom_urls``.
    """
    # Number of DNS requests: per-contract sums of `rt` and `sampo` and their
    # total, log1p-transformed. Note that fillna(0) is applied to the whole
    # merged frame, not only to the new columns.
    id_to_url = competitors.groupby('contract_id')[['rt', 'sampo']].sum()
    id_to_url['n_urls_log'] = id_to_url.eval('rt + sampo')
    X = pd.merge(X, np.log1p(id_to_url), on='contract_id', how='left')\
        .fillna(0)\
        .rename({'rt':'rt_log', 'sampo':'sampo_log'}, axis=1)

    # Number of requests in the most recent period(s).
    # The reference date is the latest log date among the contracts present in X.
    train_contracts = X.index.unique()    
    comp_max_date = competitors.query('contract_id in @train_contracts')['date'].max()
    for week_offset in [1]:
        # Despite the name, one offset unit is 5 days here.
        date_limit = comp_max_date - pd.Timedelta(days=week_offset*5)
        counts = competitors.query('date >= @date_limit')\
            .groupby('contract_id')['date'].count()\
            .rename(f'n_urls_{week_offset}_log')
        X = pd.merge(X, counts, on='contract_id', how='left')
        X[f'n_urls_{week_offset}_log'] = np.log1p(X[f'n_urls_{week_offset}_log'].fillna(0))

    # Whether the contract has any DNS requests at all
    X['has_urls'] = ((X['n_urls_log'] > 0) * 1).astype('category')

    # Many requests: more than 300 on the original (non-log) scale
    X['has_many_urls'] = ((np.expm1(X['n_urls_log']) > 300) * 1).astype('category')

    # Requests to both competitors; `sampo_log` itself is dropped afterwards
    X['has_both_competitors'] = (X.eval('(rt_log > 0) & (sampo_log > 0)') * 1).astype('category')
    X = X.drop(['sampo_log'], axis=1)

    # Whether there were requests in the most recent period(s)
    for week_offset in [1]: 
        X[f'has_urls_{week_offset}'] = ((X[f'n_urls_{week_offset}_log'] > 0) * 1).astype('category')

    # Days since the last request (log1p); contracts without requests get the
    # largest value observed in X.
    days_from_last_url = np.log1p(
        (
            comp_max_date - competitors.groupby('contract_id')['date'].max()
        ).dt.days.rename('days_from_last_url_log')
    )
    X = pd.merge(X, days_from_last_url, on='contract_id', how='left')
    X['days_from_last_url_log'] = X['days_from_last_url_log']\
        .fillna(X['days_from_last_url_log'].max())

    # Days since the first request; contracts without requests get 0.
    days_from_first_url = (
        comp_max_date - competitors.groupby('contract_id')['date'].min()
    ).dt.days.rename('days_from_first_url')
    X = pd.merge(X, days_from_first_url, on='contract_id', how='left')
    X['days_from_first_url'] = X['days_from_first_url'].fillna(0)

    # Requests to the most popular urls (5 most frequent) and to the least
    # popular ones (urls seen at most 3 times in the whole log).
    url_counts = competitors.groupby('url')['url']\
        .count()\
        .sort_values(ascending=False)
    top_5_urls = set(url_counts.head(5).index.values)
    bottom_urls = set(url_counts[url_counts <= 3].index.values)
    X['has_top_urls'] = competitors.groupby('contract_id')['url']\
        .apply(lambda x : (set(x) & top_5_urls) != set()) * 1
    X['has_bottom_urls'] = competitors.groupby('contract_id')['url']\
        .apply(lambda x : (set(x) & bottom_urls) != set()) * 1
    X[['has_top_urls', 'has_bottom_urls']] = X[['has_top_urls', 'has_bottom_urls']].fillna(0).astype('category')

    # Release the intermediate objects before returning.
    del id_to_url, counts, days_from_last_url, days_from_first_url,\
        top_5_urls, bottom_urls
    gc.collect()
    return X

def add_support_features(X, support_log):
    """Add features derived from the support-request log.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame whose index is ``contract_id`` (merges are done on that
        name and ``X.index`` is used as the list of contracts).
    support_log : pandas.DataFrame
        Support log; the columns read here are ``contract_id``,
        ``event_date`` and ``event_type``.

    Returns
    -------
    pandas.DataFrame
        ``X`` with the added columns ``n_requests_log``, ``n_requests_1_log``,
        ``has_requests``, ``has_requests_1``, ``has_many_requests``,
        ``days_from_last_request``, ``days_from_first_request``, ``n_types``,
        ``has_repeated``, ``has_repeated_last_week``,
        ``has_repeated_last_month``, ``has_top_types`` and ``has_bottom_types``.
    """
    # Number of requests per contract (log1p; 0 for contracts without requests)
    X = pd.merge(
        X,
        support_log.groupby('contract_id')['event_date'].count(), on='contract_id', how='left'
        ).rename({'event_date':'n_requests_log'}, axis=1)
    X['n_requests_log'] = np.log1p(X['n_requests_log'].fillna(0))

    # Number of requests in the most recent period(s).
    # The reference date is the latest log date among the contracts present in X.
    train_contracts = X.index.unique()    
    log_max_date = support_log.query('contract_id in @train_contracts')['event_date'].max()
    for week_offset in [1]:
        # Despite the name, one offset unit is 5 days here.
        date_limit = log_max_date - pd.Timedelta(days=week_offset*5)
        counts = support_log.query('event_date >= @date_limit')\
            .groupby('contract_id')['event_date'].count()
        X = pd.merge(X, counts, on='contract_id', how='left')\
            .rename({'event_date':f'n_requests_{week_offset}_log'}, axis=1)
        X[f'n_requests_{week_offset}_log'] = np.log1p(X[f'n_requests_{week_offset}_log'].fillna(0))

    # Whether the contract has any requests (overall and in the recent period)
    X['has_requests'] = ((X['n_requests_log'] > 0) * 1).astype('category')
    for week_offset in [1]: 
        X[f'has_requests_{week_offset}'] = ((X[f'n_requests_{week_offset}_log'] > 0) * 1).astype('category')

    # Many requests: more than 5 on the original (non-log) scale
    X['has_many_requests'] = ((np.expm1(X['n_requests_log']) > 5) * 1).astype('category')

    # Days since the last request; contracts without requests get the largest
    # value observed in X.
    days_from_last_request = (
        log_max_date - support_log.groupby('contract_id')['event_date'].max()
        ).dt.days.rename('days_from_last_request')
    # Redundant with the rename above; the name is already set.
    days_from_last_request.name = 'days_from_last_request'
    X = pd.merge(X, days_from_last_request, on='contract_id', how='left')
    X['days_from_last_request'] = X['days_from_last_request']\
        .fillna(X['days_from_last_request'].max())\
        .astype('int')

    # Days since the first request; contracts without requests get 0.
    days_from_first_request = (
        log_max_date - support_log.groupby('contract_id')['event_date'].min()
    ).dt.days.rename('days_from_first_request')
    X = pd.merge(X, days_from_first_request, on='contract_id', how='left')
    X['days_from_first_request'] = X['days_from_first_request'].fillna(0)

    # Number of distinct request types
    n_types = support_log.groupby('contract_id')['event_type'].nunique().rename('n_types')
    X = pd.merge(X, n_types, on='contract_id', how='left')
    X['n_types'] = X['n_types'].fillna(0).astype('int')
    
    # Repeated request types: 1 when a contract has more requests than distinct
    # types within the window. Windows: the whole log, the last week and the
    # last four weeks before the reference date.
    date_splits = [
        support_log['event_date'].min(),
        log_max_date - pd.Timedelta(weeks=1),
        log_max_date - pd.Timedelta(weeks=4)
        ]
    col_names = [
        'has_repeated',
        'has_repeated_last_week',
        'has_repeated_last_month'
    ]
    for col_name, date_split in zip(col_names, date_splits):  
        new_col = support_log.query('event_date >= @date_split').groupby('contract_id')['event_type'].apply(
            lambda x : ((x.shape[0] - x.nunique()) > 0) * 1
            ).rename(col_name)
        X = pd.merge(X, new_col, on='contract_id', how='left')
        X[col_name] = X[col_name].fillna(0).astype('category')

    # Requests on the most popular and the least popular topics.
    # NOTE(review): despite its name, `top_5_types` holds the 3 most frequent
    # types (head(3)); "least popular" means types seen at most 5 times.
    type_counts = support_log.groupby('event_type')['event_type']\
        .count()\
        .sort_values(ascending=False)
    top_5_types = set(type_counts.head(3).index.values)
    bottom_types = set(type_counts[type_counts <= 5].index.values)
    X['has_top_types'] = support_log.groupby('contract_id')['event_type']\
        .apply(lambda x : (set(x) & top_5_types) != set()) * 1
    X['has_bottom_types'] = support_log.groupby('contract_id')['event_type']\
        .apply(lambda x : (set(x) & bottom_types) != set()) * 1
    X[['has_top_types', 'has_bottom_types']] = X[['has_top_types', 'has_bottom_types']].fillna(0).astype('category')

    # Release the intermediate objects before returning.
    del counts, days_from_last_request, days_from_first_request, n_types,\
        date_splits, col_names, type_counts,\
        top_5_types, bottom_types
    gc.collect()
    return X

def add_meta_features(events, X, y=None, vectorizer=None, estimator=None, n_folds=2):
    """Append a stacked ``meta`` feature built from each contract's chain of events.

    The events of every contract are ordered by date and joined with ``'__'``
    into one document (``'Nothing'`` for contracts without events). Documents
    are TF-IDF vectorised and scored by an ``SGDClassifier``; the predicted
    probability of class 1 becomes the ``meta`` column.

    Modes
    -----
    * Fit mode -- ``y`` is given, ``vectorizer`` and ``estimator`` are None.
      A new vectorizer and classifier are created. ``meta`` is filled with
      out-of-fold predictions over ``n_folds`` random folds, then the
      classifier is refitted on all rows so it can be reused for new data.
    * Predict mode -- ``vectorizer`` and ``estimator`` are given, ``y`` is None.
      The fitted objects are applied to ``X`` as they are.

    Parameters
    ----------
    events : pandas.DataFrame
        Columns ``contract_id``, ``date`` and ``event``.
    X : pandas.DataFrame
        Feature frame indexed by ``contract_id``.
    y : pandas.Series, optional
        Target in the same row order as ``X`` (it is indexed positionally).
    vectorizer, estimator : optional
        Objects previously returned by this function in fit mode.
    n_folds : int
        Number of folds for the out-of-fold predictions (fit mode only).

    Returns
    -------
    tuple
        ``(X with the extra 'meta' column, vectorizer, estimator)``.

    Raises
    ------
    ValueError
        If the arguments match neither mode. Such calls used to fail later
        with an unrelated ``UnboundLocalError``.
    """
    fit_mode = vectorizer is None and estimator is None and y is not None
    predict_mode = vectorizer is not None and estimator is not None and y is None
    if not (fit_mode or predict_mode):
        raise ValueError(
            'add_meta_features expects either `y` alone (fit mode) or '
            '`vectorizer` and `estimator` without `y` (predict mode)'
        )

    index = X.index.values
    corpus = events.sort_values(by=['contract_id', 'date'])\
        .groupby('contract_id')['event']\
        .apply(lambda x : '__'.join(x))\
        .reindex(index)\
        .fillna('Nothing')
    meta = pd.Series(index=index, dtype='float', name='meta')

    if fit_mode:
        vectorizer = TfidfVectorizer(
            tokenizer = lambda s: s.split('__'),
            **META_VEC_PARAMS
            )
        # NOTE(review): newer scikit-learn releases name this loss 'log_loss';
        # confirm against the installed version before upgrading.
        estimator = SGDClassifier(
            loss='log',
            random_state=RANDOM_STATE,
            class_weight='balanced',
            **META_ESTIMATOR_PARAMS
        )

        events_vec = vectorizer.fit_transform(corpus)
        # Random, reproducible partition of the row positions into n_folds parts;
        # the last part absorbs the remainder.
        np.random.seed(RANDOM_STATE)
        rand_indices = np.random.permutation(np.arange(X.shape[0]))
        s = int(X.shape[0] / n_folds)
        for i in range(n_folds):
            if i < (n_folds-1):
                transform_fold = rand_indices[s * i : s * (i+1)]
            else:
                transform_fold = rand_indices[s * i :]
            # Fit on every row outside the current fold and predict the fold
            # itself, so no row is scored by a model that has seen its label.
            fit_fold = list(set(rand_indices) - set(transform_fold))
            estimator.fit(events_vec[fit_fold, :], y.iloc[fit_fold])
            meta.iloc[transform_fold] =\
                estimator.predict_proba(events_vec[transform_fold, :])[:, 1].flatten()
        # Final refit on all rows: this is the estimator returned to the caller.
        estimator.fit(events_vec, y)
    else:
        events_vec = vectorizer.transform(corpus)
        meta.iloc[:] = estimator.predict_proba(events_vec)[:, 1].flatten()

    X = pd.merge(X, meta, how='left', left_on='contract_id', right_index=True)
    del meta, corpus, events_vec
    gc.collect()
    return X, vectorizer, estimator

def add_pca_features(events, X, y=None, n_components=0.95, vectorizer=None, estimator=None):
    """Append PCA components of the TF-IDF encoded event chains as ``pca_i`` columns.

    The events of every contract are ordered by date and joined with ``'__'``
    into one document (``'Nothing'`` for contracts without events). Documents
    are TF-IDF vectorised, densified and projected with PCA.

    Modes
    -----
    * Fit mode -- ``vectorizer`` and ``estimator`` are both None: a new
      vectorizer and PCA are fitted on ``X``'s documents.
    * Transform mode -- both are given: the fitted objects are applied as is
      and ``n_components`` is ignored.

    Parameters
    ----------
    events : pandas.DataFrame
        Columns ``contract_id``, ``date`` and ``event``.
    X : pandas.DataFrame
        Feature frame indexed by ``contract_id``.
    y : optional
        Not used by the body.
    n_components : int or float
        Forwarded to ``PCA`` in fit mode.
    vectorizer, estimator : optional
        Objects previously returned by this function in fit mode.

    Returns
    -------
    tuple
        ``(X with columns pca_1..pca_k, vectorizer, estimator)`` where ``k`` is
        ``estimator.n_components_``.

    Raises
    ------
    ValueError
        If only one of ``vectorizer`` / ``estimator`` is supplied. Such calls
        used to fail later with an unrelated ``UnboundLocalError``.
    """
    fit_mode = vectorizer is None and estimator is None
    if not fit_mode and (vectorizer is None or estimator is None):
        raise ValueError(
            'add_pca_features expects `vectorizer` and `estimator` to be '
            'passed together or not at all'
        )

    index = X.index.values
    corpus = events.sort_values(by=['contract_id', 'date'])\
        .groupby('contract_id')['event']\
        .apply(lambda x : '__'.join(x))\
        .reindex(index)\
        .fillna('Nothing')

    if fit_mode:
        vectorizer = TfidfVectorizer(**PCA_VEC_PARAMS)
        events_vec = vectorizer.fit_transform(corpus)
        estimator = PCA(**PCA_PARAMS, n_components=n_components)
        # The vectorised matrix is densified before being handed to PCA.
        events_pca = estimator.fit_transform(events_vec.toarray())
    else:
        events_vec = vectorizer.transform(corpus)
        events_pca = estimator.transform(events_vec.toarray())

    events_pca = pd.DataFrame(
        events_pca,
        index=index,
        columns=[f'pca_{i+1}' for i in range(estimator.n_components_)]
        )
    X = pd.merge(X, events_pca, how='left', left_on='contract_id', right_index=True)

    del corpus, events_vec, events_pca
    gc.collect()
    return X, vectorizer, estimator
    
def scoring(estimator, X, y_true):
    """Scorer callable: macro-averaged recall of ``estimator``'s predictions on ``X``."""
    return recall_score(y_true, estimator.predict(X), average='macro')

def validate(model, X, y, cv, random_state=RANDOM_STATE):
    """Cross-validate ``model`` with the ``scoring`` callable and print a summary.

    Prints the mean and standard deviation of the test scores, of the train
    scores and of their fold-wise difference, then returns the array of
    per-fold test scores. ``random_state`` is accepted but not used by the body.
    """
    results = cross_validate(
        model, X, y,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
    )
    test_scores = results['test_score']
    train_scores = results['train_score']
    # Fold-wise train/test gap.
    gap = train_scores - test_scores
    for label, values in (('test: ', test_scores), ('train: ', train_scores), ('diff: ', gap)):
        print(label, values.mean(), values.std())
    return test_scores

# Валидация

In [None]:
# Target vector: the `blocked` column of the training table.
y_train = train_df['blocked']

In [None]:
# Build the training feature matrix stage by stage. The fitted vectorizers and
# estimators are kept so the same transformations can be applied to the test set.
X_train = add_payment_features(train_df.drop(columns='blocked'))
X_train = add_url_features(X_train, competitors)
X_train = add_support_features(X_train, support_log=log)
X_train, vectorizer_meta, estimator_meta = add_meta_features(
    events, X_train, y=train_df['blocked'], n_folds=30
)
X_train, vectorizer_pca, estimator_pca = add_pca_features(
    events, X_train, y=train_df['blocked'], n_components=12
)

In [None]:
# Baseline: default hyper-parameters
model = LGBMClassifier(
    objective='binary',
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    importance_type='gain',
)
lof = LocalOutlierFactor(n_jobs=-1)
# Boolean mask of rows that the outlier detector keeps.
mask = clean(X_train, lof)
cv_split = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
validate(model, X_train[mask], y_train[mask], cv=cv_split, random_state=RANDOM_STATE)

0.07%, 427 штук выбросов
test:  0.6911667434530583 0.03401115590689564
train:  0.9087515663419602 0.003728271459332209
diff:  0.21758482288890185 0.03600532844189617


array([0.67971919, 0.6454266 , 0.69057429, 0.70518509, 0.69117647,
       0.68196014, 0.67808043, 0.78347307, 0.6816565 , 0.67441565])

- Файл с признаками сохраняется для подбора гиперпараметров

In [None]:
# pd.concat([X_train, y_train], axis=1).to_csv('X_for_tuning.csv')
# !git add 'X_for_tuning.csv'
# !git commit -m 'updated features for tuning'
# !git push origin 

[master 92c8dfa] updated features for tuning
 1 file changed, 5992 insertions(+), 5992 deletions(-)
Counting objects: 3, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 79.29 KiB | 1.47 MiB/s, done.
Total 3 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
remote: 
remote: Create a pull request for 'master' on GitHub by visiting:[K
remote:      https://github.com/iamzager/Karelia_churn/pull/new/master[K
remote: 
To https://github.com/iamzager/Karelia_churn.git
 * [new branch]      master -> master


In [None]:
# Final classifier: same fixed settings as the baseline plus the tuned hyper-parameters.
best_model = LGBMClassifier(
    **MODEL_BEST_PARAMS,
    objective='binary',
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    importance_type='gain',
)

In [None]:
# Validation with the tuned hyper-parameters
cv_split = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
best_lof = LocalOutlierFactor(n_jobs=-1, **LOF_BEST_PARAMS)
# Boolean mask of rows that the tuned outlier detector keeps.
best_mask = clean(X_train, best_lof)
cv_results = validate(
    best_model,
    X_train[best_mask],
    y_train[best_mask],
    cv=cv_split,
    random_state=RANDOM_STATE,
)

0.05%, 278 штук выбросов
test:  0.7520706961421089 0.027369368182338046
train:  0.7881648271101672 0.0027332833881615823
diff:  0.03609413096805832 0.02929964836663476


# Прогноз

In [None]:
# Build the test feature matrix with the same stages as for training, reusing
# the vectorizers and estimators fitted on the training data.
X_test = add_payment_features(sample.set_index('contract_id').drop(columns='blocked'))
X_test = add_url_features(X_test, competitors)
X_test = add_support_features(X_test, support_log=log)
X_test, _, _ = add_meta_features(
    events, X_test, y=None, vectorizer=vectorizer_meta, estimator=estimator_meta
)
X_test, _, _ = add_pca_features(
    events, X_test, y=None, n_components=None,
    vectorizer=vectorizer_pca, estimator=estimator_pca
)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
# Fit the tuned model on the outlier-free training rows, predict the test
# contracts and write the predictions in the submission layout.
best_model.fit(X_train[best_mask], y_train[best_mask])
preds = pd.Series(best_model.predict(X_test), index=X_test.index, name='blocked')
preds = sample.drop(columns='blocked').merge(preds, on='contract_id')
preds.to_csv(PREDS_FILE_NAME, index=False)

In [None]:
# !git add $PREDS_FILE_NAME
# !git commit -m 'updated predictions'
# !git push origin