## Импорт библиотек

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Инструкция по установке пакета: https://github.com/maks-sh/scikit-uplift
# Инстркция на документацию: https://scikit-uplift.readthedocs.io/en/latest/
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel
from sklift.models import ClassTransformation

# sklift поддерживает любые модели, 
# которые удовлетворяют соглашениями scikit-learn
# Для примера воспользуемся catboost
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from sklift.preprocess import balancer
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

## Получение выборки

In [3]:
PATH_TO_DATA = 'C:/Users/Anatoly/1python/X5retail/heroX5-master/data/'
# Чтение данных
# Выбросим последнего пользователя
df_clients = pd.read_csv(PATH_TO_DATA+ 'clients.csv', index_col='client_id').iloc[:-1,:]
df_train = pd.read_csv(PATH_TO_DATA+ 'uplift_train.csv', index_col='client_id').iloc[:-1,:]
df_test = pd.read_csv(PATH_TO_DATA+ 'uplift_test.csv', index_col='client_id')

# Извлечение признаков
df_features = df_clients.copy()

cat_features =['gender']
    
models_results = {
    'approach': [],
    'uplift@30%': []
}


In [4]:
def delay_time(data):
    '''
    Calculate delay in seconds/10*6 beetween datetime dataseries.
    Return finish dataset.
    
    Args:
        data(Dataframe): data with columns 'first_issue_date', 'first_redeem_date'
        
    '''
    
    issue_date = data['first_issue_date'] 
    redeem_date = data['first_redeem_date'] 
    
    redeem_time = pd.to_datetime(redeem_date)
    issue_time = pd.to_datetime(issue_date)
    
    start_time = datetime(2000,1,1)
    delay = []
    issue_sec = []
    redeem_sec = []
    for redeem,issue in zip(redeem_time, issue_time):
        # if x not None
        issue_sec.append((issue - start_time).seconds)
        
        if redeem==redeem:
            redeem_sec.append((redeem - start_time).seconds)
            delay.append((redeem - issue).seconds/10**4)
        else:
            delay.append(0)
            redeem_sec.append(0)
            
    
    data['issue_redeem_delay'] = np.array(delay)
    data['first_issue_time'] = np.array(issue_sec)
    data['first_redeem_time'] = np.array(redeem_sec) 
    
    data = data.drop(['first_issue_date', 'first_redeem_date'], axis=1)
    
    return data
        

In [5]:
%%time
df_features = delay_time(df_features)

Wall time: 16.4 s


In [6]:
# Соединяем с моими признаками
df_my = pd.read_csv(PATH_TO_DATA + 'all_users.csv',index_col='client_id')
df_features = df_features.join(df_my)

In [7]:
def initialize_train_test(df_train, df_test, data, estimator=None):
    '''
    Split data to train and validation.
    Return X_train, y_train, treat_train; X_val, y_val, treat_val; X_train_full, y_train_full, treat_train_full, data
    
    Args: 
        df_train: train data with treatment and target
        df_test: trest data with treatment and target
        data_feats: data with features
        estimator(str): estimator that will train
    '''

    data_feats = data.copy()
    cat_features = ['gender']
    # One-hot encoding for estimators which are not catboost
    if estimator!='catboost':
        data_feats['F'] = (data_feats.gender=='F').astype(int)
        data_feats['M'] = (data_feats.gender=='M').astype(int)
        data_feats['U'] = (data_feats.gender=='U').astype(int)
        cat_features = ['F','M','U']
        data_feats = data_feats.drop(columns='gender')
    
    # Присваеваем индексы трейн и тест, обучения и валидации
    indices_train = df_train.index
    indices_test = df_test.index
    indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123) #, stratify=df_train.treatment_flg)


    # Обучающая выборка
    X_train = data_feats.loc[indices_learn, :]
    y_train = df_train.loc[indices_learn, 'target']
    treat_train = df_train.loc[indices_learn, 'treatment_flg']

    # Валидационная выборка
    X_val = data_feats.loc[indices_valid, :]
    y_val = df_train.loc[indices_valid, 'target']
    treat_val =  df_train.loc[indices_valid, 'treatment_flg']

    # Полные данные
    X_train_full = data_feats.loc[indices_train, :]
    y_train_full = df_train.loc[:, 'target']
    treat_train_full = df_train.loc[:, 'treatment_flg']

    X_test = data_feats.loc[indices_test, :]
    
    return X_train, y_train, treat_train,\
           X_val, y_val, treat_val,\
           X_train_full, y_train_full, treat_train_full,\
           cat_features, data_feats, X_test

# Standartization

In [8]:
# Не помогло
def scale_data(data):
    
    scaler = StandardScaler()
    scaler.fit(data.iloc[:,2:])
    
    scaled_data = pd.DataFrame(scaler.transform(data.iloc[:,2:]), columns=data.iloc[:,2:].columns, index=data.index)
    scaled_data = pd.concat([data.iloc[:,:2],scaled_data], axis=1)
    
    return scaled_data

scaled_data = scale_data(df_features)

NameError: name 'StandardScaler' is not defined

# Auto

In [9]:
def update_table(estimator, model, score, balance, models_results):
    estimator_name = str(estimator).replace("'","").replace('<','').replace('>','').split('.')[-1]
    approach_name = str(model).replace("'","").replace('<','').replace('>','').split('.')[-1]

    models_results['approach'].append(approach_name)
    models_results['balance'].append(balance)
    models_results['estimator'].append(estimator_name)
    models_results['uplift@30%'].append(score)
    
    return models_results

In [10]:
def choose_model(estimators, models, data_feat, df_train, df_test):
    '''
    Create table with results of training model.
    
    Args:
        estimator:  classifier
        model: model of uplift (SoloModel, ClassTransformation)
        data_feat : training and validation data with features
        df_train : train data with target and treatment
        df_test : test data with target and treatment
    '''
    
        
    models_results = {
    'approach': [],
    'balance': [],
    'estimator': [],
    'uplift@30%': []
    }
    
    
    df_features = data_feat.copy()
    
    for estimator in estimators:
        for model in models:
            for i, balance in enumerate(['balanced','not balanced']):
                # Initialization 
                if estimator==CatBoostClassifier:
                    X_train, y_train, treat_train,\
                    X_val, y_val, treat_val,\
                    X_train_full, y_train_full, treat_train_full,\
                    cat_features, df_features_est, X_test = initialize_train_test(df_train, df_test, df_features, 'catboost')

                    if balance=='balanced':
                        X_train, treat_train, y_train = balancer(X_train, treat_train, y_train, random_state=0, verbose=False)
                        X_train = pd.DataFrame(X_train, columns = df_features_est.columns)

                if estimator!=CatBoostClassifier:
                    X_train, y_train, treat_train,\
                    X_val, y_val, treat_val,\
                    X_train_full, y_train_full, treat_train_full,\
                    cat_features, df_features_est, X_test = initialize_train_test(df_train, df_test, df_features)

                    if balance=='balanced':
                        X_train, treat_train, y_train = balancer(X_train, treat_train, y_train, random_state=0, verbose=False)
                        X_train = pd.DataFrame(X_train, columns = df_features_est.columns)

                # Estimation
                if estimator==CatBoostClassifier:
                    uplift_model = model(estimator(thread_count=2, random_state=42, silent=True))
                    uplift_model = uplift_model.fit(X_train, y_train, treat_train, estimator_fit_params={'cat_features': cat_features})

                if estimator!=CatBoostClassifier:
                    uplift_model = model(estimator(random_state=42))
                    uplift_model = uplift_model.fit(X_train, y_train, treat_train)

                uplift_model_predict = uplift_model.predict(X_val)
                uplift_score = uplift_at_k(y_true=y_val, uplift=uplift_model_predict, treatment=treat_val, k=0.3)
                models_results = update_table(estimator, model, uplift_score, balance, models_results)
    
    return pd.DataFrame(data=models_results).sort_values('uplift@30%', ascending=False)
        
    

In [11]:
estimators = [CatBoostClassifier, XGBClassifier, RandomForestClassifier, DecisionTreeClassifier]
models = [SoloModel, ClassTransformation]

In [73]:
%%time
models_results = choose_model(estimators, models, df_features, df_train, df_test)



Wall time: 10min 24s


In [74]:
models_results

Unnamed: 0,approach,balance,estimator,uplift@30%
5,SoloModel,not balanced,XGBClassifier,0.063509
4,SoloModel,balanced,XGBClassifier,0.061864
0,SoloModel,balanced,CatBoostClassifier,0.060889
1,SoloModel,not balanced,CatBoostClassifier,0.054664
6,ClassTransformation,balanced,XGBClassifier,0.053876
3,ClassTransformation,not balanced,CatBoostClassifier,0.050547
7,ClassTransformation,not balanced,XGBClassifier,0.047768
2,ClassTransformation,balanced,CatBoostClassifier,0.047319
15,ClassTransformation,not balanced,DecisionTreeClassifier,0.043689
12,SoloModel,balanced,DecisionTreeClassifier,0.041901
