### - model validation on pre-pared data
### - make submit

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import dill
import sys
import pandas as pd
import numpy as np
import gc
import warnings
  
from dstools.spark import init_spark2

from dstools.ml import yandex_mean_encoder

from sklearn.model_selection import train_test_split

from sklift.models import SoloModel, ClassTransformation, TwoModels
from sklift.preprocess import balancer
from sklift.metrics import uplift_at_k

from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot 

from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore') 

In [3]:
spark = init_spark2({
    "appName": "ret_baseline",
    "spark.yarn.queue": "default",
    "spark.driver.memory": "32g",
    "spark.executor.memoryOverhead": "2048",
    "spark.driver.extraJavaOptions": "-XX:ThreadStackSize=81920",
    "spark.executor.extraJavaOptions": "-XX:ThreadStackSize=81920",
    "spark.driver.maxResultSize": "45g",
    "spark.driver.memory": "45g"
})

In [4]:
sys.path.append('./scripts/')
from add_functions import catb_get_feature_imp, make_validation

-----

### load data

In [5]:
# read all data
sdf_cliens = spark.read.csv('/user/mvshevc5/retailhero/clients.csv', inferSchema=True, header=True)
sdf_products = spark.read.csv('/user/mvshevc5/retailhero/products.csv', inferSchema=True, header=True)
sdf_purchase = spark.read.csv('/user/mvshevc5/retailhero/purchases.csv', inferSchema=True, header=True)
sdf_train = spark.read.csv('/user/mvshevc5/retailhero/uplift_train.csv', inferSchema=True, header=True)
sdf_test = spark.read.csv('/user/mvshevc5/retailhero/uplift_test.csv', inferSchema=True, header=True)

In [8]:
df_clients = sdf_cliens.toPandas()
df_train = sdf_train.toPandas()
df_test = sdf_test.toPandas()

df_clients = df_clients.set_index('client_id')
df_train = df_train.set_index('client_id')
df_test = df_test.set_index('client_id')

In [21]:
# baseline features data:
base_train = pd.read_csv('data/feats_baseline_train.csv', index_col=0)
base_train = base_train.drop(['treatment_flg','target'], axis=1)
base_train.columns  = ['base_' + col for col in base_train.columns.tolist()]

print(base_train.shape)
base_train.head(2)

(200039, 23)


Unnamed: 0,base_first_issue_date,base_first_redeem_date,base_age,base_gender,base_total_trans_count,base_last_month_trans_count,base_regular_points_received_sum_all,base_express_points_received_sum_all,base_regular_points_spent_sum_all,base_express_points_spent_sum_all,base_purchase_sum_sum_all,base_store_id_sum_all,base_regular_points_received_sum_last_month,base_express_points_received_sum_last_month,base_regular_points_spent_sum_last_month,base_express_points_spent_sum_last_month,base_purchase_sum_sum_last_month,base_store_id_sum_last_month,base_first_issue_date_weekday,base_first_redeem_date_weekday,base_first_issue_date_hour,base_first_redeem_date_hour,base_diff
000012768d,1501948000.0,1515094000.0,45,85,4,2,25.7,0.0,0.0,0.0,2803.0,3,10.0,0.0,0.0,0.0,1222.0,1,5,3.0,15,19.0,13146559.0
000036f903,1491832000.0,1492951000.0,72,70,32,8,54.9,60.0,0.0,0.0,9805.0,5,13.7,0.0,0.0,0.0,2784.0,4,0,6.0,13,12.0,1118613.0


In [22]:
base_test = pd.read_csv('data/feats_baseline_test.csv', index_col=0)
base_test = base_test.drop(['target'], axis=1)
base_test.columns  = ['base_' + col for col in base_test.columns.tolist()]

print(base_test.shape)
base_test.head(2)

(200123, 23)


Unnamed: 0,base_first_issue_date,base_first_redeem_date,base_age,base_gender,base_total_trans_count,base_last_month_trans_count,base_regular_points_received_sum_all,base_express_points_received_sum_all,base_regular_points_spent_sum_all,base_express_points_spent_sum_all,base_purchase_sum_sum_all,base_store_id_sum_all,base_regular_points_received_sum_last_month,base_express_points_received_sum_last_month,base_regular_points_spent_sum_last_month,base_express_points_spent_sum_last_month,base_purchase_sum_sum_last_month,base_store_id_sum_last_month,base_first_issue_date_weekday,base_first_redeem_date_weekday,base_first_issue_date_hour,base_first_redeem_date_hour,base_diff
000048b7a6,1544881000.0,-9223372000.0,68,70,8,1,26.5,0.0,0.0,0.0,3772.0,2,1.2,0.0,0.0,0.0,342.0,1,5,,13,,-10768250000.0
000073194a,1495544000.0,1511522000.0,60,70,17,6,74.9,0.0,-96.0,0.0,9601.4,1,25.0,0.0,0.0,0.0,3393.3,1,1,4.0,12,11.0,15978110.0


In [23]:
# transform clients data

df_clients['first_issue_date_weekday'] = df_clients['first_issue_date'].dt.weekday
df_clients['first_redeem_date_weekday'] = df_clients['first_redeem_date'].dt.weekday
df_clients['first_issue_date_hour'] = df_clients['first_issue_date'].dt.hour
df_clients['first_redeem_date_hour'] = df_clients['first_redeem_date'].dt.hour
df_clients['first_issue_date'] = df_clients['first_issue_date'].astype(int)/10**9
df_clients['first_redeem_date'] = df_clients['first_redeem_date'].astype(int)/10**9
df_clients['diff'] = df_clients['first_redeem_date']-df_clients['first_issue_date']
df_clients['gender'] = list(ord(v[0]) for v in df_clients['gender'].values)

df_clients.columns = ['cl_' + col for col in df_clients.columns.tolist()]

In [24]:
# load all trans data:
df_total = pd.read_csv('data/putch_total_agg.csv', index_col=0)
df_total_cat = pd.read_csv('data/putch_total_agg_cat.csv', index_col=0)
df_total_date = pd.read_csv('data/putch_total_agg_date.csv', index_col=0)
df_last = pd.read_csv('data/putch_last_agg.csv', index_col=0)

In [35]:
df_full_train_feats = pd.concat([
    df_clients, df_total, df_total_cat, df_total_date, df_last, base_train
], axis=1)

df_full_train = df_full_train_feats.join(df_train, how='right')

df_full_test_feats = pd.concat([
    df_clients, df_total, df_total_cat, df_total_date, df_last, base_test
], axis=1)

df_full_test = df_full_test_feats.join(df_test, how='right')

del df_full_train_feats, df_full_test_feats
gc.collect()

print(f'Train shape {df_full_train.shape}')
print(f'Test shape {df_full_test.shape}')

#Train shape (200039, 129)
#Test shape (200123, 127)
df_full_train.head(2)

Train shape (200039, 129)
Test shape (200123, 127)


Unnamed: 0_level_0,cl_first_issue_date,cl_first_redeem_date,cl_age,cl_gender,cl_first_issue_date_weekday,cl_first_redeem_date_weekday,cl_first_issue_date_hour,cl_first_redeem_date_hour,cl_diff,total_count,transaction_id__nunique,regular_points_received__mean,regular_points_received__max,regular_points_received__min,regular_points_received__median,express_points_received__mean,express_points_received__max,express_points_received__min,express_points_received__median,regular_points_spent__mean,regular_points_spent__max,regular_points_spent__min,regular_points_spent__median,express_points_spent__mean,express_points_spent__max,express_points_spent__min,express_points_spent__median,purchase_sum__mean,purchase_sum__max,purchase_sum__min,purchase_sum__median,product_quantity__mean,product_quantity__max,product_quantity__min,product_quantity__median,trn_sum_from_iss__mean,trn_sum_from_iss__max,trn_sum_from_iss__min,trn_sum_from_iss__median,trn_sum_from_red__mean,trn_sum_from_red__max,trn_sum_from_red__min,trn_sum_from_red__median,netto__mean,netto__max,netto__min,netto__median,most_comm_product_id,nunique_product_id,most_comm_cnt_ratio_product_id,most_comm_sum_ratio_product_id,most_comm_transaction_id,nunique_transaction_id,most_comm_cnt_ratio_transaction_id,most_comm_sum_ratio_transaction_id,most_comm_store_id,nunique_store_id,most_comm_cnt_ratio_store_id,most_comm_sum_ratio_store_id,most_comm_level_1,nunique_level_1,most_comm_cnt_ratio_level_1,most_comm_sum_ratio_level_1,most_comm_level_2,nunique_level_2,most_comm_cnt_ratio_level_2,most_comm_sum_ratio_level_2,most_comm_level_3,nunique_level_3,most_comm_cnt_ratio_level_3,most_comm_sum_ratio_level_3,most_comm_level_4,nunique_level_4,most_comm_cnt_ratio_level_4,most_comm_sum_ratio_level_4,most_comm_segment_id,nunique_segment_id,most_comm_cnt_ratio_segment_id,most_comm_sum_ratio_segment_id,most_comm_vendor_id,nunique_vendor_id,most_comm_cnt_ratio_vendor_id,most_comm_sum_ratio_vendor_id,min_date_trans,max_date_trans,days_from_last_trans,lats_total_count,lats_transaction_id__nunique,lats_regular_points_received__mean,lats_regular_points_received__max,lats_regular_points_received__min,lats_regular_points_received__median,lats_regular_points_spent__mean,lats_regular_points_spent__max,lats_regular_points_spent__min,lats_regular_points_spent__median,lats_purchase_sum__mean,lats_purchase_sum__max,lats_purchase_sum__min,lats_purchase_sum__median,lats_product_quantity__mean,lats_product_quantity__max,lats_product_quantity__min,lats_product_quantity__median,base_first_issue_date,base_first_redeem_date,base_age,base_gender,base_total_trans_count,base_last_month_trans_count,base_regular_points_received_sum_all,base_express_points_received_sum_all,base_regular_points_spent_sum_all,base_express_points_spent_sum_all,base_purchase_sum_sum_all,base_store_id_sum_all,base_regular_points_received_sum_last_month,base_express_points_received_sum_last_month,base_regular_points_spent_sum_last_month,base_express_points_spent_sum_last_month,base_purchase_sum_sum_last_month,base_store_id_sum_last_month,base_first_issue_date_weekday,base_first_redeem_date_weekday,base_first_issue_date_hour,base_first_redeem_date_hour,base_diff,treatment_flg,target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1
000012768d,1501948000.0,1515094000.0,45,85,5,3.0,15,19.0,13146559.0,52,4,7.551923,10.0,2.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,784.788462,1007.0,419.0,803.0,1.038462,4.0,0.0,1.0,53.942308,170.0,0.0,50.0,,,,,0.540231,1.5,0.018,0.4,057ea8df98,46,0.038462,0.024333,7e3e2e3984,4,0.365385,0.468843,017c89b915,3,0.423077,0.376437,e344ab2e71,3,0.519231,0.545002,ad2b2e17d2,8,0.326923,0.312627,ca69ed9de2,24,0.230769,0.215467,46951c62e8,31,0.076923,0.069298,1.0,23.0,0.078431,0.073518,43acd80c1a,29,0.076923,0.083094,1543648000.0,1552576000.0,292,22,2,6.363636,8.0,2.0,8.0,0.0,0.0,0.0,0.0,698.272727,803.0,419.0,803.0,0.863636,2.0,0.0,1.0,1501948000.0,1515094000.0,45.0,85.0,4.0,2.0,25.7,0.0,0.0,0.0,2803.0,3.0,10.0,0.0,0.0,0.0,1222.0,1.0,5.0,3.0,15.0,19.0,13146559.0,0,1
000036f903,1491832000.0,1492951000.0,72,70,0,6.0,13,12.0,1118613.0,162,32,2.15,7.0,0.2,1.6,2.222222,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,362.746914,700.0,58.0,328.0,1.04321,4.0,0.0,1.0,60.555556,367.0,0.0,54.5,,,,,0.481877,1.0,0.002,0.35,449e431b58,96,0.049383,0.049485,517dea5e24,32,0.055556,0.097558,6381a55c22,5,0.691358,0.64923,c3d3a8e8c6,3,0.537037,0.569795,034aca0659,12,0.234568,0.248481,0f84eb7480,39,0.111111,0.091534,420c3b3f0b,58,0.111111,0.091534,150.0,41.0,0.213836,0.194833,e6af81215a,44,0.234568,0.248481,1543402000.0,1552819000.0,289,44,8,1.9,2.5,0.4,2.1,0.0,0.0,0.0,0.0,384.613636,506.0,93.0,431.0,1.136364,4.0,0.0,1.0,1491832000.0,1492951000.0,72.0,70.0,32.0,8.0,54.9,60.0,0.0,0.0,9805.0,5.0,13.7,0.0,0.0,0.0,2784.0,4.0,0.0,6.0,13.0,12.0,1118613.0,1,1


-----

#### MODEL

In [46]:
try_feat = list(set(df_full_train.columns.tolist()) - set(['target', 'treatment_col']))

cat_cols = df_full_train.select_dtypes(include='object').columns.tolist()
cat_cols = list(set(cat_cols).intersection(set(try_feat)))

In [77]:
with open('models/uplift_my_and_baseline_feats_two_models_catb.dill', 'rb') as f_in:
    model = dill.load(f_in)
    
feat_imp = catb_get_feature_imp(
    catb_est=model.estimator_trmnt,
    features_list=df_full_train\
        .drop(['treatment_flg', 'target'], axis=1)\
        .columns\
        .tolist(), 
    n_top=80
)

top_feats = feat_imp['feat'].values.tolist()

cat_top_feats = df_full_train[top_feats]\
    .select_dtypes(include='object')\
    .columns\
    .tolist()
    
try_feat = list(
   set(top_feats) - set(cat_top_feats)
)

len(try_feat)

74

In [108]:
## PREPARE DATA FOR XGB / LGBM MODEL, fit transform target_encodet on train data
enc = yandex_mean_encoder(
    columns=cat_top_feats, alpha=100
)

xgb_est_params = {
    'max_depth':2, 
    'learning_rate':0.5, 
    'n_estimators':20,
    'nthread':30,
    'n_gpus':0,
    'seed':234
}

estimator = XGBClassifier(**xgb_est_params)

estimator_trmnt = XGBClassifier(**xgb_est_params)
estimator_ctrl = XGBClassifier(**xgb_est_params)

uplift_model_2mod = TwoModels(
    estimator_trmnt=estimator_trmnt, 
    estimator_ctrl=estimator_ctrl
)

uplift_model_cl_tr = ClassTransformation(
    estimator=estimator
)

upl_xgb_cl_tr_pipeline = Pipeline(steps=[
    ('enc', enc),
    ('est', uplift_model_cl_tr)
])

###### SAME FOR 2 mdodels

In [None]:
try_feat = list(set(try_feat) - set(cat_cols))

val_res_all_feats_xgb_2mod = make_validation(
    df_full=df_full_train,
    upift_model=uplift_model_2mod,
    try_feat=try_feat,
    n_iter=20
)

val_res_all_feats_xgb_cl_tr = make_validation(
    df_full=df_full_train,
    upift_model=uplift_model_cl_tr,
    try_feat=try_feat,
    n_iter=20
)

In [None]:
try_feat = top_feats

val_res_all_feats_pipeline = make_validation(
    df_full=df_full_train,
    upift_model=upl_xgb_cl_tr_pipeline,
    try_feat=try_feat,
    pipeline_flg=True,
    n_iter=20
)

In [107]:
val_res_all_feats_xgb_cl_tr_df = pd.DataFrame(val_res_all_feats_xgb_cl_tr).T
val_res_all_feats_pipeline_df = pd.DataFrame(val_res_all_feats_pipeline).T

print(
    np.round(val_res_all_feats_xgb_cl_tr_df['score_val'].mean(), 5), 
    np.round(val_res_all_feats_xgb_cl_tr_df['score_val'].std(), 5)
)

print(
    np.round(val_res_all_feats_xgb_2mod_df['score_val'].mean(), 5), 
    np.round(val_res_all_feats_xgb_2mod_df['score_val'].std(), 5)
)

print(
    np.round(val_res_all_feats_pipeline_df['score_val'].mean(), 5), 
    np.round(val_res_all_feats_pipeline_df['score_val'].std(), 5)
)

# ALL cont 127 feats: 
# 0.55418 / 0.00949

# TOP 76 cont feats:
# 0.07135 / 0.00708

# TOP 76 cont feats + cat top 6 feats:
# 0.07233 0.00512 
# 0.07038 0.00508

0.0699 0.00691
0.0699 0.00691
0.07038 0.00508


-----

##### Learn on full train df and submit:

In [None]:
uplift_model_cl_tr.fit(
    X=df_full_train[try_feat],
    y=df_full_train['target'],
    treatment=df_full_train['treatment_flg']   
)

In [65]:
upl_final_pred_test = uplift_model_cl_tr.predict(df_full_test[try_feat])

In [66]:
df_submit = df_full_test.assign(uplift=upl_final_pred_test)[['uplift']]

print(df_submit.shape)
df_submit.head(2)

(200123, 1)


Unnamed: 0_level_0,uplift
client_id,Unnamed: 1_level_1
000048b7a6,0.046865
000073194a,0.057391


In [67]:
df_submit.to_csv('submissions/sub8_myrealfeats_xgb_d2_cl_tr.csv')
!head -2 'submissions/sub8_myrealfeats_xgb_d2_cl_tr.csv'

# Public: 0,0874

client_id,uplift
000048b7a6,0.04686451


----