### - join full data
### - get cv results
### - Validate top features number as parameter
### - Validate model params

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import gc
import os
import re
import sys
import warnings

import dill
import pandas as pd
import numpy as np

from dstools.spark import init_spark2, pandify2

from dstools.ml import yandex_mean_encoder

from sklearn.model_selection import train_test_split

from sklift.models import SoloModel, ClassTransformation, TwoModels
from sklift.preprocess import balancer
from sklift.metrics import uplift_at_k

from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot

from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Silence every Python warning for the rest of the session so that warning
# text does not clutter the cell outputs.
warnings.filterwarnings('ignore')

In [3]:
# Put the local ./scripts/ directory on the import path so that the project
# helper module add_functions can be imported on the next line.
sys.path.append('./scripts/')
from add_functions import catb_get_feature_imp, feat_imp, make_validation

In [3]:
# Create the Spark session through the dstools helper. The string argument is
# presumably the Spark application name -- confirm against init_spark2.
spark = init_spark2("ret_modeling_1_2")

-------

#### LOAD DATA

In [None]:
# Directory that holds the competition CSV files (read through Spark).
data_path = '/user/kvliksak/retailhero'

# Read the train and the test client lists with header + inferred schema and
# bring each of them into a local pandas frame.
df_train, df_test = (
    spark.read.csv(
        os.path.join(data_path, csv_name),
        inferSchema=True, header=True
    ).toPandas()
    for csv_name in ('uplift_train.csv', 'uplift_test.csv')
)

In [92]:
# Spark table with the additional per-client features.
sdf_feats = spark.table('ntwk_sb.fix_price_all_geo_events_delete_me_pls')

# The raw date / datetime columns are dropped on the Spark side, the
# "top ... cnt" columns are explicitly cast to float by pandify2, and the
# result is collected into pandas and indexed by client_id.
df_feats = (
    pandify2(
        sdf_feats.drop(
            'first_issue_date',
            'first_redeem_date',
            'cl_transaction_datetime_min',
            'cl_transaction_datetime_max',
        ),
        cast_overrides={
            cnt_col: 'float'
            for cnt_col in (
                'cl_top_lvl_1_cnt',
                'cl_top_lvl_2_cnt',
                'cl_top_lvl_3_cnt',
                'cl_top_lvl_4_cnt',
                'cl_top_prod_cnt',
            )
        },
    )
    .toPandas()
    .set_index('client_id')
)

print(df_feats.shape)

# Prefix every feature name with 'm_' so these columns stay distinguishable
# after they are joined with the other feature sets.
df_feats.columns = ['m_' + col for col in df_feats.columns.tolist()]

(400162, 77)


In [8]:
# Names of all 'm_' features, and of the subset stored with object dtype
# (treated as categorical further down).
all_feats_new = list(df_feats.columns)
cat_feats_new = list(df_feats.select_dtypes(include='object').columns)

print(f'New feats cnt: {len(all_feats_new)}, new cat feats cnt: {len(cat_feats_new)}')
# Earlier run: New feats cnt: 59, new cat feats cnt: 17

New feats cnt: 77, new cat feats cnt: 23


In [10]:
# read all data

# Client columns that have to be loaded as strings (object dtype) so that they
# are handled as categories instead of numbers.
cat_client_cols = [
    'cl_first_issue_date_weekday',
    'cl_first_redeem_date_weekday',
    'cl_first_issue_date_hour',
    'cl_first_redeem_date_hour',
]

# column name -> 'object' for every categorical client column
cl_dtypes = dict.fromkeys(cat_client_cols, 'object')

# Full client feature table, indexed by client_id.
df_full = pd.read_csv(
    'data/full_features_df.csv',
    index_col='client_id',
    dtype=cl_dtypes,
)

In [11]:
def _as_ex_features(frame):
    """Prefix all columns of ``frame`` with 'ex_' and expose the index as a column.

    The column labels are rewritten on ``frame`` itself; the returned frame has
    the former index as a regular column, renamed to 'client_id' when
    ``reset_index`` produced the generic name 'index'.
    """
    frame.columns = ['ex' + '_' + col for col in frame.columns.tolist()]
    return frame.reset_index().rename(columns={'index': 'client_id'})


df_baseline_new_tr = pd.read_csv('data/new_baseline_tr.csv', index_col=0)
df_baseline_new_ts = pd.read_csv('data/new_baseline_ts.csv', index_col=0)

# The train file also carries the treatment flag and the target; they are
# removed here so that only features remain.
df_baseline_new_tr = df_baseline_new_tr.drop(['treatment_flg', 'target'], axis=1)

df_baseline_new_tr = _as_ex_features(df_baseline_new_tr)
df_baseline_new_ts = _as_ex_features(df_baseline_new_ts)

In [93]:
# Column-wise join of the base client features (df_full) with the
# 'm_'-prefixed features (df_feats). Both frames are indexed by client_id,
# so concat aligns the rows on that index.
# The second frame used to be referenced as `df_feats_maks`, a name that is
# never defined in this notebook; the recorded shapes (77 extra columns)
# match df_feats.
df_full_mod = pd.concat([
    df_full, df_feats
], axis=1)\
    .reset_index()\
    .rename(columns={
        # only has an effect when the index name was lost and reset_index
        # fell back to the generic column name 'index'
        'index': 'client_id'
    })

df_full_mod.shape
# (400162, 310)

(400162, 328)

In [94]:
# Training frame: keep exactly the clients listed in df_train (right join on
# client_id), then attach the 'ex_' baseline features where available.
df_full_train = (
    df_full_mod
    .merge(df_train, on='client_id', how='right')
    .merge(df_baseline_new_tr, on='client_id', how='left')
)

print(df_full_train.shape)
# (200039, 312)
# (200039, 462)

(200039, 479)


In [95]:
# Test frame: keep exactly the clients listed in df_test (right join on
# client_id), then attach the 'ex_' baseline features where available.
df_full_test = (
    df_full_mod
    .merge(df_test, on='client_id', how='right')
    .merge(df_baseline_new_ts, on='client_id', how='left')
)

print(df_full_test.shape)
# (200123, 310)
# (200123, 460)

(200123, 477)


In [16]:
# FILL CAT COLS WITH NA -> 'NA'
# Categorical columns whose missing values are replaced by the literal 'NA'.
cat_cols_na = ['cl_first_redeem_date_hour', 'cl_first_redeem_date_weekday']

# Same treatment for the train and the test frame.
for _frame in (df_full_train, df_full_test):
    _frame.loc[:, cat_cols_na] = _frame[cat_cols_na].fillna('NA')

In [187]:
# Replace missing values by the literal 'NA' in every object-typed feature
# column of the train and test frames.
# The column list is derived right here: `cat_cols` is only defined further
# down (MODELING section), so relying on it made this cell fail with a
# NameError on a fresh kernel run from top to bottom.
obj_feat_cols = [
    col for col in df_full_train.select_dtypes(include='object').columns
    # identifier / label columns are not features
    if col not in ('client_id', 'treatment_flg', 'target')
]

df_full_train.loc[:, obj_feat_cols] = df_full_train[obj_feat_cols].fillna('NA')
df_full_test.loc[:, obj_feat_cols] = df_full_test[obj_feat_cols].fillna('NA')

#### MODELING

In [103]:
# Identifier / label columns that are never used as model inputs.
non_feats = ['client_id', 'treatment_flg', 'target']

# Every other column is a candidate feature.
all_feats = list(set(df_full_train.columns.tolist()) - set(non_feats))

# Candidate features stored with object dtype are treated as categorical.
cat_cols = list(
    set(df_full_train.select_dtypes(include='object').columns) - set(non_feats)
)

print(f'All feats cnt: {len(all_feats)}')
print(f'Cat feats cnt: {len(cat_cols)}')

All feats cnt: 476
Cat feats cnt: 31


In [288]:
# Identifier / label columns (excluded whenever a feature list is built).
id_cols = ['client_id', 'target', 'treatment_flg']

# Feature subset under test: only the columns whose name starts with 'ex_'.
try_feat = [
    col
    for col in df_full_train.columns.tolist()
    if re.match('ex_', col)
]

len(try_feat)
# 59

149

In [183]:
# Feature subset under test: every column of the training frame except the
# identifier / label columns.
all_feats = df_full_train.columns.tolist()
try_feat = list(set(all_feats) - set(id_cols))

len(try_feat)

476

In [209]:
# Load the saved feature ranking: the first line of the file holds the
# feature names separated by ';'.
with open('top_all_feats.txt', 'r') as f_in:
    # Strip the line terminator and skip empty entries so that a trailing
    # newline or an empty file cannot produce bogus feature names such as
    # 'name\n' or ''.
    top_feats = [
        name for name in f_in.readline().strip().split(';') if name
    ]

In [211]:
# Feature subset under test: the first n_top entries of top_feats
# (presumably ordered by importance -- confirm against the code that wrote
# top_all_feats.txt).
n_top = 10
try_feat = top_feats[:n_top]

In [284]:
### SET PIPELINES:
## PREPARE DATA FOR XGB / LGBM MODEL, fit transform target encoder on train data

# The mean encoder is applied only to the categorical columns that are part
# of the feature subset under test.
enc = yandex_mean_encoder(
    columns=list(set(cat_cols) & set(try_feat)),
    alpha=100
)

xgb_est_params = {
    # tree size and boosting schedule
    'max_depth': 2,
    'learning_rate': 0.2,
    'n_estimators': 100,

    # regularisation (min_child_weight=5 / subsample=0.6 were tried and
    # are currently switched off)
    'lambda': 1,
    'alpha': 0,

    'booster': 'dart',

    # execution settings
    'nthread': 40,
    'n_gpus': 0,
    'seed': 10023
}

estimator = XGBClassifier(**xgb_est_params)

# Uplift model: class-transformation approach on top of the XGBoost classifier.
uplift_model_cl_tr = ClassTransformation(estimator=estimator)

# Encoder followed by the uplift model.
xgb_pipeline = Pipeline(steps=[('enc', enc), ('est', uplift_model_cl_tr)])


In [278]:
# CatBoost settings: depth-1 trees, strong L2 regularisation, 1000 iterations,
# silent training on GPU device '1'.
catb_est_params = {
    'depth': 1,
    'l2_leaf_reg': 45,
    'iterations': 1000,
    'verbose': False,
    'random_state': 1000,
    'task_type': 'GPU',
    'devices': '1'
}

# CatBoost receives the categorical columns of the tested subset directly,
# so no separate encoder step is used for this model.
estimator_catb = CatBoostClassifier(
    cat_features=list(set(cat_cols) & set(try_feat)),
    **catb_est_params
)

# Uplift model: class-transformation approach on top of CatBoost.
uplift_model_cl_tr_catb = ClassTransformation(estimator=estimator_catb)

In [184]:
# Second CatBoost variant: library-default hyper-parameters, progress logged
# every 200 iterations, GPU device '1'. The name estimator_catb is rebound
# here, replacing the estimator created in the previous cell.
estimator_catb = CatBoostClassifier(
    cat_features=list(set(cat_cols) & set(try_feat)),
    verbose=200,
    random_state=42,
    task_type='GPU',
    devices='1'
)

uplift_model_cl_tr2 = ClassTransformation(estimator=estimator_catb)

In [289]:
# LightGBM settings for the class-transformation uplift model.
# 'num_leaves' used to be listed twice in this literal (same value both
# times); the duplicate key was removed, the resulting dict is unchanged.
lgbm_params = {
    'learning_rate': 0.03,
    'max_depth': 2,
    'n_estimators': 200,
    'num_leaves': 20,
    'min_data_in_leaf': 3,
    'application': 'binary',
    'subsample': 0.5,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.05,
    'data_random_seed': 42,
    'metric': 'binary_logloss',
    'max_bin': 416,
    'bagging_freq': 3,
    'reg_lambda': 0.05,
    'nthread': 20,
    'seed': 42
}

# The mean encoder is applied only to the categorical columns that are part
# of the feature subset under test.
enc = yandex_mean_encoder(
    columns=list(set(cat_cols).intersection(set(try_feat))),
    alpha=100
)

estimator_lgbm = LGBMClassifier(**lgbm_params, verbose=200)

# Uplift model: class-transformation approach on top of LightGBM.
uplift_model_cl_tr_lgbm = ClassTransformation(
    estimator=estimator_lgbm
)

# Encoder followed by the uplift model.
lgbm_pipeline = Pipeline(steps=[
    ('enc', enc),
    ('est', uplift_model_cl_tr_lgbm)
])

In [218]:
# Random forest settings: shallow trees, 40 parallel jobs.
rf_params = {'max_depth': 4, 'n_jobs': 40}
estimator_rf = RandomForestClassifier(**rf_params)

# The mean encoder is applied only to the categorical columns that are part
# of the feature subset under test.
enc = yandex_mean_encoder(
    columns=list(set(cat_cols) & set(try_feat)),
    alpha=100
)

# Uplift model: class-transformation approach on top of the random forest.
uplift_model_cl_tr_rf = ClassTransformation(estimator=estimator_rf)

# Encoder followed by the uplift model.
rf_pipeline = Pipeline(steps=[('enc', enc), ('est', uplift_model_cl_tr_rf)])

In [167]:

# The mean encoder is applied only to the categorical columns that are part
# of the feature subset under test.
enc = yandex_mean_encoder(
    columns=list(set(cat_cols).intersection(set(try_feat))),
    alpha=100
)

# causalml uplift random forest; the control group is identified by the
# string label '0'.
uplift_model_cml = UpliftRandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    min_samples_leaf=200,
    min_samples_treatment=50,
    n_reg=100,
    evaluationFunction='KL',
    control_name='0'
)

# Encoder followed by the causalml model. The estimator step used to point
# at `uplift_model`, a name that is not defined in this notebook; the model
# built right above is uplift_model_cml.
cm_pipeline = Pipeline(steps=[
    ('enc', enc),
    ('est', uplift_model_cml)
])

----

#### VALIDATE

In [256]:
# Feature subset under test: the first 120 entries of top_feats.
try_feat = top_feats[:120]

In [None]:
# Validate the LightGBM class-transformation model (no encoder pipeline,
# pipeline_flg=False) on the current feature subset with n_iter=15.
# NOTE(review): the result is stored as val_res_catb although the model is
# LightGBM-based; the "LOCAL RESULTS" cell reads this name.
val_res_catb = make_validation(
    df_full=df_full_train,
    upift_model=uplift_model_cl_tr_lgbm,
    try_feat=try_feat,
    pipeline_flg=False,
    n_iter=15
)

In [None]:
# Validate the encoder + XGBoost pipeline (pipeline_flg=True) on the current
# feature subset with n_iter=15.
# NOTE(review): the result is stored as val_res_catb although the model is
# XGBoost-based.
val_res_catb = make_validation(
    df_full=df_full_train,
    upift_model=xgb_pipeline,
    try_feat=try_feat,
    pipeline_flg=True,
    n_iter=15
)

In [None]:
# Sweep over the number of top features and compare CatBoost, XGBoost and
# LightGBM class-transformation models on the mean validation score.
#
# The per-setting summaries are collected as plain dicts and turned into a
# DataFrame. Previously `out_res.append(...)` was called without keeping its
# return value, so out_res stayed empty (and DataFrame.append no longer
# exists in pandas 2.x).
score_records = []
out_res = pd.DataFrame()
cnt_top_feats = [20, 30, 50, 70, 100]

for n_top in cnt_top_feats:

    print(f'Top feats: {n_top}')

    try_feat = top_feats[:n_top]

    # Encoder and CatBoost estimator depend on the feature subset, so they
    # are rebuilt for every n_top.
    enc = yandex_mean_encoder(
        columns=list(set(cat_cols)\
                    .intersection(set(try_feat))),
        alpha=100
    )

    estimator_catb = CatBoostClassifier(
        cat_features=list(
            set(cat_cols)\
            .intersection(set(try_feat))
        ), **catb_est_params
    )

    uplift_model_cl_tr_catb = ClassTransformation(
        estimator=estimator_catb
    )

    # The XGBoost / LightGBM / random-forest uplift models are the ones
    # created in the cells above; only the encoder step is new.
    xgb_pipeline = Pipeline(steps=[
        ('enc', enc),
        ('est', uplift_model_cl_tr)
    ])

    lgbm_pipeline = Pipeline(steps=[
        ('enc', enc),
        ('est', uplift_model_cl_tr_lgbm)
    ])

    rf_pipeline = Pipeline(steps=[
        ('enc', enc),
        ('est', uplift_model_cl_tr_rf)
    ])

    val_res_catb = make_validation(
        df_full=df_full_train,
        upift_model=uplift_model_cl_tr_catb,
        try_feat=try_feat,
        pipeline_flg=False,
        n_iter=5
    )

    val_res_xgb = make_validation(
        df_full=df_full_train,
        upift_model=xgb_pipeline,
        try_feat=try_feat,
        pipeline_flg=True,
        n_iter=5
    )

    val_res_lgbm = make_validation(
        df_full=df_full_train,
        upift_model=lgbm_pipeline,
        try_feat=try_feat,
        pipeline_flg=True,
        n_iter=5
    )

    # The validation results are transposed so that 'score_val' can be
    # selected as a column.
    val_res_catb_df =\
    pd.DataFrame(val_res_catb).T

    val_res_xgb_df =\
    pd.DataFrame(val_res_xgb).T

    val_res_lgbm_df =\
    pd.DataFrame(val_res_lgbm).T

#     val_res_rf_df =\
#     pd.DataFrame(val_res_rf).T

    score_records.append({
        'n_top': n_top,
        'score_catb': val_res_catb_df['score_val'].mean(),
        'score_xgb': val_res_xgb_df['score_val'].mean(),
        'score_lgbm': val_res_lgbm_df['score_val'].mean()
    })
    # Rebuilt on every pass so partial results survive an interrupted run.
    out_res = pd.DataFrame(score_records)

    # Printed in the order CatBoost, XGBoost, LightGBM.
    print(f"VAL MEAN: {val_res_catb_df['score_val'].mean():.5f} +- {val_res_catb_df['score_val'].std():.5f}")
    print(f"VAL MEAN: {val_res_xgb_df['score_val'].mean():.5f} +- {val_res_xgb_df['score_val'].std():.5f}")
    print(f"VAL MEAN: {val_res_lgbm_df['score_val'].mean():.5f} +- {val_res_lgbm_df['score_val'].std():.5f}")

In [253]:
# Sweep over the number of top features for the CatBoost class-transformation
# model only.
#
# The per-setting summaries are collected as plain dicts and turned into a
# DataFrame. Previously `out_res.append(...)` was called without keeping its
# return value, so out_res stayed empty (and DataFrame.append no longer
# exists in pandas 2.x).
score_records = []
out_res = pd.DataFrame()
cnt_top_feats = [100, 110, 120, 130, 140]

for n_top in cnt_top_feats:

    print(f'Top feats: {n_top}')

    try_feat = top_feats[:n_top]

    # The categorical feature list depends on the subset, so the estimator
    # is rebuilt for every n_top.
    estimator_catb = CatBoostClassifier(
        cat_features=list(
            set(cat_cols)\
            .intersection(set(try_feat))
        ), **catb_est_params
    )

    uplift_model_cl_tr_catb = ClassTransformation(
        estimator=estimator_catb
    )

    val_res_catb = make_validation(
        df_full=df_full_train,
        upift_model=uplift_model_cl_tr_catb,
        try_feat=try_feat,
        pipeline_flg=False,
        n_iter=5
    )

    # Transposed so that 'score_val' can be selected as a column.
    val_res_catb_df =\
    pd.DataFrame(val_res_catb).T

    score_records.append({
        'n_top': n_top,
        'score_catb': val_res_catb_df['score_val'].mean(),
    })
    # Rebuilt on every pass so partial results survive an interrupted run.
    out_res = pd.DataFrame(score_records)

    print(f"VAL MEAN: {val_res_catb_df['score_val'].mean():.5f} +- {val_res_catb_df['score_val'].std():.5f}")

Top feats: 100
Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5
VAL MEAN: 0.08062 +- 0.00401
Top feats: 110
Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5
VAL MEAN: 0.08109 +- 0.00743
Top feats: 120
Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5
VAL MEAN: 0.08162 +- 0.00726
Top feats: 130
Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5
VAL MEAN: 0.08147 +- 0.00708
Top feats: 140
Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5
VAL MEAN: 0.08129 +- 0.00741


In [None]:
# Disabled: cast of the treatment flag to str (kept for reference).
#df_full_train.loc[:,'treatment_flg'] = df_full_train['treatment_flg'].astype(str)

# Validate the LightGBM class-transformation model (no encoder pipeline,
# pipeline_flg=False) on the current feature subset with n_iter=15.
# NOTE(review): the result is stored as val_res_catb although the model is
# LightGBM-based.
val_res_catb = make_validation(
    df_full=df_full_train,
    upift_model=uplift_model_cl_tr_lgbm,
    try_feat=try_feat,
    pipeline_flg=False,
    n_iter=15
)

In [None]:
# The causalml forest is configured with control_name='0' (a string label),
# so it gets a copy of the training frame whose treatment flag is cast to str.
# A separate frame is used on purpose: casting df_full_train in place would
# leave string flags behind for every later cell, including the
# ClassTransformation models and the final fit on the whole train set.
df_full_train_cml = df_full_train.assign(
    treatment_flg=df_full_train['treatment_flg'].astype(str)
)

# NOTE(review): unlike every other make_validation call in this notebook, a
# bare numpy array (`.values`) is passed as df_full here -- confirm that
# make_validation can select `try_feat` from an array.
val_res_catb = make_validation(
    df_full=df_full_train_cml.values,
    upift_model=uplift_model_cml,
    try_feat=try_feat,
    pipeline_flg=False,
    n_iter=3
)

In [None]:
# Validate the random-forest class-transformation model (no encoder pipeline,
# pipeline_flg=False) on the current feature subset with n_iter=3.
# NOTE(review): the result is stored as val_res_catb although the model is a
# random forest.
val_res_catb = make_validation(
    df_full=df_full_train,
    upift_model=uplift_model_cl_tr_rf,
    try_feat=try_feat,
    pipeline_flg=False,
    n_iter=3
)

In [138]:
# Validate a pipeline model (pipeline_flg=True) on the current feature subset
# with n_iter=5.
# NOTE(review): `my_pipeline` is not defined anywhere in the visible notebook,
# so this cell fails on a fresh kernel. It presumably stands for one of
# xgb_pipeline / lgbm_pipeline / rf_pipeline -- confirm and rename.
val_res_catb = make_validation(
    df_full=df_full_train,
    upift_model=my_pipeline,
    try_feat=try_feat,
    pipeline_flg=True,
    n_iter=5
)

Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5


#### FIT ON WHOLE TRAIN DF

In [None]:
# Fit the CatBoost class-transformation model on the complete training frame.
# Positional arguments: feature matrix, target, treatment flag.
uplift_model_cl_tr_catb.fit(
    df_full_train[try_feat], 
    df_full_train['target'],
    df_full_train['treatment_flg']
)

In [280]:
# Uplift predictions of the fitted CatBoost model for the test clients.
pred_test = uplift_model_cl_tr_catb.predict(df_full_test[try_feat])

# Submission frame: a single 'uplift' column indexed by client_id.
df_submit = (
    df_full_test
    .set_index('client_id')
    .assign(uplift=pred_test)
    [['uplift']]
)

print(df_submit.shape)
df_submit.head(2)

# The target directory 'submissions/' has to exist already.
df_submit.to_csv('submissions/sub17_catb_1-1000_ex_cl_tr.csv')

(200123, 1)


In [281]:
# Show the first five lines of the written submission file.
!head -5 submissions/sub17_catb_1-1000_ex_cl_tr.csv

client_id,uplift
000048b7a6,0.05446345569856792
000073194a,0.04197991583015659
00007c7133,0.037999595985255485
00007f9014,0.023304079208585238


In [196]:
# Build the feature list from the CatBoost estimator wrapped by
# uplift_model_cl_tr2: the helper returns a table, of which the 'feat'
# column is kept as a plain Python list (n_top=1000 is passed to the helper).
top_feats = (
    catb_get_feature_imp(
        uplift_model_cl_tr2.estimator,
        uplift_model_cl_tr2.estimator.feature_names_,
        n_top=1000
    )['feat']
    .values
    .tolist()
)

In [199]:
# Persist the feature list as a single ';'-separated line (no trailing
# newline); this is the file read back into top_feats earlier in the notebook.
with open('top_all_feats.txt', 'w') as f_out:
    f_out.write(';'.join(top_feats))

-----

#### LOCAL RESULTS

In [291]:
# Summarise the most recent validation run stored in val_res_catb: the result
# is transposed so that 'score_tr' / 'score_val' can be selected as columns,
# then mean and standard deviation of both scores are printed.
val_res_catb_df =\
    pd.DataFrame(val_res_catb).T

print(f"TRAIN MEAN: {val_res_catb_df['score_tr'].mean():.5f} +- {val_res_catb_df['score_tr'].std():.5f}")

print(f"VAL MEAN: {val_res_catb_df['score_val'].mean():.5f} +- {val_res_catb_df['score_val'].std():.5f}")

# Log of earlier runs (validation mean, std):

# Mask 77 feats, catb / 2 / 800
# 0.07803 0.00648

# All 260 all feats, catb / 2 / 800
# 0.07878 0.00477

# ex 150 feats, catb / 2 / 800
# 0.08046 0.00631

# lgbm 150
#VAL MEAN: 0.08052 +- 0.00372

#TRAIN MEAN: 0.11686 +- 0.00302
#VAL MEAN: 0.07917 +- 0.00538

TRAIN MEAN: 0.11617 +- 0.00349
VAL MEAN: 0.07917 +- 0.00538


-----

##### VALIDATE PIPELINE PARAMS

In [None]:
# Collected as [param, (mean, std)] entries, one per tried setting.
out_res = []

# Settings to compare, each as (max_depth, n_estimators, learning_rate).
xgb_try_params = [
    (1, 10, 1),
    (2, 20, 0.5),
    (2, 100, 0.5),
    (2, 200, 0.4),
    (3, 100, 0.5),
]

for param in xgb_try_params:
    d, n, eta = param
    print(f'Parms: {param}')

    # Note: this rebinds the notebook-level name xgb_est_params.
    xgb_est_params = {
        'max_depth': d,
        'learning_rate': eta,
        'n_estimators': n,
        'nthread': 50,
        'n_gpus': 0,
        'seed': 0
    }
    xgb_est = XGBClassifier(**xgb_est_params)

    # Class-transformation uplift model around the freshly built classifier.
    uplift_tmp_model_xgb_cl_tr = ClassTransformation(estimator=xgb_est)

    val_tmp = make_validation(
        df_full=df_full_train,
        upift_model=uplift_tmp_model_xgb_cl_tr,
        try_feat=try_feat,
        n_iter=20
    )

    # Transposed so that 'score_val' can be selected as a column.
    val_res_df = pd.DataFrame(val_tmp).T

    # Mean / std of the validation score, rounded to 5 decimals.
    mean = np.round(val_res_df['score_val'].mean(), 5)
    std = np.round(val_res_df['score_val'].std(), 5)
    print(f'Mean: {mean}, Std: {std}\n')

    out_res.append([param, (mean, std)])