####  MAKE FEATURES / uplift different pipelines / validation

- data aggregation, last data features, total aggs
- set uplift pipelines
- get validation scores
- search model best params with validation

In [None]:
%load_ext autoreload
%autoreload 2

In [16]:
import dill
import sys
import pandas as pd
import numpy as np
import gc
import warnings
  
from dstools.spark import init_spark2

from dstools.ml import yandex_mean_encoder

from sklearn.model_selection import train_test_split

from sklift.models import SoloModel, ClassTransformation, TwoModels
from sklift.metrics import uplift_at_k

from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot 

from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

In [5]:
#!/data/dfs1/home/kvliksak/Feature_importance/research_venv_py37/bin/python -m pip install causalml
#!/data/dfs1/home/kvliksak/Feature_importance/research_venv_py37/bin/python -m pip install scikit-uplift 

In [6]:
spark = init_spark2({
    "appName": "ret_baseline",
    "spark.yarn.queue": "default",
    "spark.driver.memory": "32g",
    "spark.executor.memoryOverhead": "2048",
    "spark.driver.extraJavaOptions": "-XX:ThreadStackSize=81920",
    "spark.executor.extraJavaOptions": "-XX:ThreadStackSize=81920",
    "spark.driver.maxResultSize": "45g",
    "spark.driver.memory": "45g"
})

In [7]:
sys.path.append('./scripts/')
from add_functions import catb_get_feature_imp, make_validation

----

##### LOAD DATA

In [8]:
# read all data
sdf_cliens = spark.read.csv('/user/mvshevc5/retailhero/clients.csv', inferSchema=True, header=True)
sdf_products = spark.read.csv('/user/mvshevc5/retailhero/products.csv', inferSchema=True, header=True)
sdf_purchase = spark.read.csv('/user/mvshevc5/retailhero/purchases.csv', inferSchema=True, header=True)
sdf_train = spark.read.csv('/user/mvshevc5/retailhero/uplift_train.csv', inferSchema=True, header=True)
sdf_test = spark.read.csv('/user/mvshevc5/retailhero/uplift_test.csv', inferSchema=True, header=True)

In [9]:
sdf_purchase = sdf_purchase.join(sdf_products, on=['product_id'], how='left')

In [10]:
df_clients = sdf_cliens.toPandas()
df_train = sdf_train.toPandas()
df_test = sdf_test.toPandas()
df_products = sdf_products.toPandas()
df_purchases = sdf_purchase.toPandas()

In [11]:
df_clients = df_clients.set_index('client_id')
df_train = df_train.set_index('client_id')
df_test = df_test.set_index('client_id')
df_products = df_products.set_index('product_id')

df_purchases.loc[:,'transaction_datetime'] =  pd.to_datetime(df_purchases['transaction_datetime'])
df_clients.loc[:,'first_issue_date'] = pd.to_datetime(df_clients['first_issue_date'])
df_clients.loc[:,'first_redeem_date'] = pd.to_datetime(df_clients['first_redeem_date'])

In [12]:
# with products
df_purchases.shape
# (45 786 568, 23)

(45786568, 23)

----

#### MAKE FEATURES FROM purchase and products:

In [13]:
df_purchases.head(2)

Unnamed: 0,product_id,client_id,transaction_id,transaction_datetime,regular_points_received,express_points_received,regular_points_spent,express_points_spent,purchase_sum,store_id,product_quantity,trn_sum_from_iss,trn_sum_from_red,level_1,level_2,level_3,level_4,segment_id,brand_id,vendor_id,netto,is_own_trademark,is_alcohol
0,9a80204f78,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,2.0,80.0,,e344ab2e71,ed2ad1797c,b25baa9dd5,51647c28e9,116.0,082560ca58,63243765ed,0.031,0,0
1,da89ebd374,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,1.0,65.0,,e344ab2e71,ed2ad1797c,0767853bf3,eaeb795060,14.0,cab440afaf,43acd80c1a,0.4,1,0


In [None]:
id_cols =  ['product_id', 'transaction_id', 'store_id']

cont_cols = ['regular_points_received','express_points_received', 'regular_points_spent', 'express_points_spent',
    'purchase_sum', 'product_quantity', 'trn_sum_from_iss', 'trn_sum_from_red', 'netto']

cat_cols = ['level_1', 'level_2', 'level_3', 'level_4', 'segment_id', 'vendor_id']

binary_cols = ['is_own_trademark', 'is_alcohol']

In [None]:
agg_cols_1 = ['mean', 'max', 'min', 'median']
agg_cols_2 = ['nunique']

df_purch_total = df_purchases\
    .groupby('client_id')\
    .agg({
        'transaction_id': ['count', 'nunique'],
        'regular_points_received': agg_cols_1,
        'express_points_received': agg_cols_1,
        'regular_points_spent': agg_cols_1,
        'express_points_spent': agg_cols_1,
        'purchase_sum': agg_cols_1,
        'product_quantity': agg_cols_1,
        'trn_sum_from_iss': agg_cols_1,
        'trn_sum_from_red': agg_cols_1,
        'netto': agg_cols_1
    })
    
df_purch_total.columns =  ['__'.join(col).strip() for col in df_purch_total.columns.values]
df_purch_total = df_purch_total.rename(columns={'transaction_id__count':'total_count'})

In [None]:
print(df_purch_total.shape)
df_purch_total.head(2)

In [150]:
# save:
df_purch_total.to_csv('data/putch_total_agg.csv')

In [160]:
%%time
cols = id_cols + cat_cols

df_purch_cat_aggs = []

for col in cols:
    print(f'Aggregate column: {col}')
    df_col_agg = get_category_feat(df_purchases, column=col)
    df_purch_cat_aggs.append(df_col_agg)
    
#  5min 48s

Aggregate column: product_id
Aggregate column: transaction_id
Aggregate column: store_id
Aggregate column: level_1
Aggregate column: level_2
Aggregate column: level_3
Aggregate column: level_4
Aggregate column: segment_id
Aggregate column: vendor_id
CPU times: user 5min 49s, sys: 207 ms, total: 5min 49s
Wall time: 5min 48s


In [161]:
df_purch_total_cat = pd.concat(df_purch_cat_aggs, axis=1)

In [162]:
print(df_purch_total_cat.shape)
df_purch_total_cat.head(2)

(400162, 36)


Unnamed: 0,most_comm_product_id,nunique_product_id,most_comm_cnt_ratio_product_id,most_comm_sum_ratio_product_id,most_comm_transaction_id,nunique_transaction_id,most_comm_cnt_ratio_transaction_id,most_comm_sum_ratio_transaction_id,most_comm_store_id,nunique_store_id,most_comm_cnt_ratio_store_id,most_comm_sum_ratio_store_id,most_comm_level_1,nunique_level_1,most_comm_cnt_ratio_level_1,most_comm_sum_ratio_level_1,most_comm_level_2,nunique_level_2,most_comm_cnt_ratio_level_2,most_comm_sum_ratio_level_2,most_comm_level_3,nunique_level_3,most_comm_cnt_ratio_level_3,most_comm_sum_ratio_level_3,most_comm_level_4,nunique_level_4,most_comm_cnt_ratio_level_4,most_comm_sum_ratio_level_4,most_comm_segment_id,nunique_segment_id,most_comm_cnt_ratio_segment_id,most_comm_sum_ratio_segment_id,most_comm_vendor_id,nunique_vendor_id,most_comm_cnt_ratio_vendor_id,most_comm_sum_ratio_vendor_id
000012768d,057ea8df98,46,0.038462,0.024333,7e3e2e3984,4,0.365385,0.468843,017c89b915,3,0.423077,0.376437,e344ab2e71,3,0.519231,0.545002,ad2b2e17d2,8,0.326923,0.312627,ca69ed9de2,24,0.230769,0.215467,46951c62e8,31,0.076923,0.069298,1.0,23.0,0.078431,0.073518,43acd80c1a,29,0.076923,0.083094
000036f903,449e431b58,96,0.049383,0.049485,517dea5e24,32,0.055556,0.097558,6381a55c22,5,0.691358,0.64923,c3d3a8e8c6,3,0.537037,0.569795,034aca0659,12,0.234568,0.248481,0f84eb7480,39,0.111111,0.091534,420c3b3f0b,58,0.111111,0.091534,150.0,41.0,0.213836,0.194833,e6af81215a,44,0.234568,0.248481


In [163]:
# save:
df_purch_total_cat.to_csv('data/putch_total_agg_cat.csv')

##### Agg last data:

In [159]:
df_purchases.head(2)

Unnamed: 0,product_id,client_id,transaction_id,transaction_datetime,regular_points_received,express_points_received,regular_points_spent,express_points_spent,purchase_sum,store_id,product_quantity,trn_sum_from_iss,trn_sum_from_red,level_1,level_2,level_3,level_4,segment_id,brand_id,vendor_id,netto,is_own_trademark,is_alcohol
0,9a80204f78,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,2.0,80.0,,e344ab2e71,ed2ad1797c,b25baa9dd5,51647c28e9,116.0,082560ca58,63243765ed,0.031,0,0
1,da89ebd374,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,1.0,65.0,,e344ab2e71,ed2ad1797c,0767853bf3,eaeb795060,14.0,cab440afaf,43acd80c1a,0.4,1,0


In [None]:
%%time
FIX_DATE = pd.to_datetime('2020-01-01')

df_purch_date_aggs = df_purchases.groupby('client_id')\
 .agg({'transaction_datetime': ['min', 'max']})

df_purch_date_aggs.columns = ['min_date', 'max_date']
df_purch_date_aggs.loc[:,'days_from_last_trans'] = (FIX_DATE - df_purch_date_aggs['max_date']).dt.days.astype(int)
df_purch_date_aggs.loc[:,'min_date_trans'] = df_purch_date_aggs['min_date'].astype(int)/10**9
df_purch_date_aggs.loc[:,'max_date_trans'] = df_purch_date_aggs['max_date'].astype(int)/10**9

df_purch_date_aggs = df_purch_date_aggs[['min_date_trans', 'max_date_trans', 'days_from_last_trans']]

In [None]:
print(df_purch_date_aggs.shape)
df_purch_date_aggs.head(2)

In [214]:
# save:
df_purch_date_aggs.to_csv('data/putch_total_agg_date.csv')

In [216]:
## LAST DATES AGG:

TRANS_DATE = '2019-02-18'

df_purch_last = df_purchases[df_purchases['transaction_datetime'] > TRANS_DATE]\
    .groupby('client_id')\
    .agg({
        'transaction_id': ['count', 'nunique'],
        'regular_points_received': agg_cols_1,
        'regular_points_spent': agg_cols_1,
        'purchase_sum': agg_cols_1,
        'product_quantity': agg_cols_1,
    })
    
df_purch_last.columns =  ['__'.join(col).strip() for col in df_purch_last.columns.values]
df_purch_last = df_purch_last.rename(columns={'transaction_id__count':'total_count'})
df_purch_last.columns = ['lats_' + col for col in df_purch_last.columns.tolist() ]

In [223]:
print(df_purch_last.shape)
df_purch_last.head(2)

(400162, 18)


Unnamed: 0_level_0,lats_total_count,lats_transaction_id__nunique,lats_regular_points_received__mean,lats_regular_points_received__max,lats_regular_points_received__min,lats_regular_points_received__median,lats_regular_points_spent__mean,lats_regular_points_spent__max,lats_regular_points_spent__min,lats_regular_points_spent__median,lats_purchase_sum__mean,lats_purchase_sum__max,lats_purchase_sum__min,lats_purchase_sum__median,lats_product_quantity__mean,lats_product_quantity__max,lats_product_quantity__min,lats_product_quantity__median
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
000012768d,22,2,6.363636,8.0,2.0,8.0,0.0,0.0,0.0,0.0,698.272727,803.0,419.0,803.0,0.863636,2.0,0.0,1.0
000036f903,44,8,1.9,2.5,0.4,2.1,0.0,0.0,0.0,0.0,384.613636,506.0,93.0,431.0,1.136364,4.0,0.0,1.0


In [288]:
# save:
df_purch_last.to_csv('data/putch_last_agg.csv')

----

#### JOIN ALL DATA

In [181]:
df_clients = sdf_cliens.toPandas()
df_train = sdf_train.toPandas()
df_test = sdf_test.toPandas()

df_clients = df_clients.set_index('client_id')
df_train = df_train.set_index('client_id')
df_test = df_test.set_index('client_id')

In [182]:
# baseline features data:
base_train = pd.read_csv('data/feats_baseline_train.csv', index_col=0)
base_train = base_train.drop(['treatment_flg','target'], axis=1)
base_train.columns  = ['base_' + col for col in base_train.columns.tolist()]

print(base_train.shape)
base_train.head(2)

(200039, 23)


Unnamed: 0,base_first_issue_date,base_first_redeem_date,base_age,base_gender,base_total_trans_count,base_last_month_trans_count,base_regular_points_received_sum_all,base_express_points_received_sum_all,base_regular_points_spent_sum_all,base_express_points_spent_sum_all,base_purchase_sum_sum_all,base_store_id_sum_all,base_regular_points_received_sum_last_month,base_express_points_received_sum_last_month,base_regular_points_spent_sum_last_month,base_express_points_spent_sum_last_month,base_purchase_sum_sum_last_month,base_store_id_sum_last_month,base_first_issue_date_weekday,base_first_redeem_date_weekday,base_first_issue_date_hour,base_first_redeem_date_hour,base_diff
000012768d,1501948000.0,1515094000.0,45,85,4,2,25.7,0.0,0.0,0.0,2803.0,3,10.0,0.0,0.0,0.0,1222.0,1,5,3.0,15,19.0,13146559.0
000036f903,1491832000.0,1492951000.0,72,70,32,8,54.9,60.0,0.0,0.0,9805.0,5,13.7,0.0,0.0,0.0,2784.0,4,0,6.0,13,12.0,1118613.0


In [183]:
base_test = pd.read_csv('data/feats_baseline_test.csv', index_col=0)
base_test = base_test.drop(['target'], axis=1)
base_test.columns  = ['base_' + col for col in base_test.columns.tolist()]

print(base_test.shape)
base_test.head(2)

(200123, 23)


Unnamed: 0,base_first_issue_date,base_first_redeem_date,base_age,base_gender,base_total_trans_count,base_last_month_trans_count,base_regular_points_received_sum_all,base_express_points_received_sum_all,base_regular_points_spent_sum_all,base_express_points_spent_sum_all,base_purchase_sum_sum_all,base_store_id_sum_all,base_regular_points_received_sum_last_month,base_express_points_received_sum_last_month,base_regular_points_spent_sum_last_month,base_express_points_spent_sum_last_month,base_purchase_sum_sum_last_month,base_store_id_sum_last_month,base_first_issue_date_weekday,base_first_redeem_date_weekday,base_first_issue_date_hour,base_first_redeem_date_hour,base_diff
000048b7a6,1544881000.0,-9223372000.0,68,70,8,1,26.5,0.0,0.0,0.0,3772.0,2,1.2,0.0,0.0,0.0,342.0,1,5,,13,,-10768250000.0
000073194a,1495544000.0,1511522000.0,60,70,17,6,74.9,0.0,-96.0,0.0,9601.4,1,25.0,0.0,0.0,0.0,3393.3,1,1,4.0,12,11.0,15978110.0


In [184]:
# transform clients data

df_clients['first_issue_date_weekday'] = df_clients['first_issue_date'].dt.weekday
df_clients['first_redeem_date_weekday'] = df_clients['first_redeem_date'].dt.weekday
df_clients['first_issue_date_hour'] = df_clients['first_issue_date'].dt.hour
df_clients['first_redeem_date_hour'] = df_clients['first_redeem_date'].dt.hour
df_clients['first_issue_date'] = df_clients['first_issue_date'].astype(int)/10**9
df_clients['first_redeem_date'] = df_clients['first_redeem_date'].astype(int)/10**9
df_clients['diff'] = df_clients['first_redeem_date']-df_clients['first_issue_date']
df_clients['gender'] = list(ord(v[0]) for v in df_clients['gender'].values)

df_clients.columns = ['cl_' + col for col in df_clients.columns.tolist()]

In [185]:
print(df_clients.shape)
df_clients.head(2)

(400162, 9)


Unnamed: 0_level_0,cl_first_issue_date,cl_first_redeem_date,cl_age,cl_gender,cl_first_issue_date_weekday,cl_first_redeem_date_weekday,cl_first_issue_date_hour,cl_first_redeem_date_hour,cl_diff
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
000012768d,1501948000.0,1515094000.0,45,85,5,3.0,15,19.0,13146559.0
000036f903,1491832000.0,1492951000.0,72,70,0,6.0,13,12.0,1118613.0


In [186]:
# load all trans data:
df_total = pd.read_csv('data/putch_total_agg.csv', index_col=0)
df_total_cat = pd.read_csv('data/putch_total_agg_cat.csv', index_col=0)
df_total_date = pd.read_csv('data/putch_total_agg_date.csv', index_col=0)
df_last = pd.read_csv('data/putch_last_agg.csv', index_col=0)

In [188]:
df_full_train = pd.concat([
    df_clients, df_total, df_total_cat, df_total_date, df_last, base_train
], axis=1)

df_full_test = pd.concat([
    df_clients, df_total, df_total_cat, df_total_date, df_last, base_test
], axis=1)

df_full_train = df_train.join(df_full_train)
df_full_test = df_test.join(df_full_test)

print(f'Train full shape: {df_full.shape}')
print(f'Test full shape: {df_full_test.shape}')

#Train full shape: (400162, 127)
#Test full shape: (200123, 127)

Train full shape: (400162, 127)
Test full shape: (200123, 127)


----

##### TRAIN TEST SPLIT

In [19]:
df_tr, df_val = train_test_split(
    df_full_train, test_size=0.3, random_state=2
)

-----

###### LEARN MODEL

In [136]:
try_feat = df_full.columns.tolist()

cat_cols = df_full.select_dtypes(include='object').columns.tolist()
cat_cols = list(set(cat_cols).intersection(set(try_feat)))

In [32]:
## PREPARE DATA FOR XGB / LGBM MODEL, fit transform target_encodet on train data
enc = yandex_mean_encoder(columns=cat_cols, alpha=100)

enc.fit(
    df_tr, y=df_tr['target']
)

df_tr_mod = enc.transform(df_tr)
df_val_mod = enc.transform(df_val)

In [84]:
enc = yandex_mean_encoder(columns=cat_cols, alpha=100)

enc.fit(
    df_full_train, y=df_full_train['target']
)

df_full_train_mod = enc.transform(df_full_train)
df_full_test_mod = enc.transform(df_full_test)

In [102]:
# SET models


lgbm_params = {
    'learning_rate':0.03,
    'max_depth':4,
    'n_estimators': 480,
    'num_leaves':20,
    'min_data_in_leaf':3,
    'application':'binary',
    'subsample':0.8, 
    'colsample_bytree': 0.8,
    'reg_alpha':0.01,
    'data_random_seed':42,
    'metric':'binary_logloss',
    'max_bin':416,
    'bagging_freq':3,
    'reg_lambda':0.01,
    'num_leaves':20,
    'nthread': 20,
    'seed': 42
    }

catb_params = {
    'n_estimators': 500,
    'max_depth':10,
    'thread_count': 50,
    'random_state': 42
}

estimator_trmnt = CatBoostClassifier(
    cat_features=cat_cols,  verbose=200, random_state=42, task_type='GPU', devices='1'
)

# estimator_ctrl = CatBoostClassifier(
#     cat_features=cat_cols,  verbose=200, random_state=42, task_type='GPU', devices='1'
# )

# estimator_trmnt = CatBoostClassifier(cat_features=cat_cols, thread_count=50, verbose=200)
# estimator_ctrl = CatBoostClassifier(cat_features=cat_cols, thread_count=50, verbose=200)

#estimator_trmnt = CatBoostClassifier(cat_features=cat_cols, **catb_params, verbose=200)
#estimator_ctrl = CatBoostClassifier(cat_features=cat_cols, **catb_params, verbose=200)

#estimator_trmnt = CatBoostClassifier( thread_count=50, verbose=200)
#estimator_ctrl = CatBoostClassifier( thread_count=50, verbose=200)

#estimator_trmnt = LGBMClassifier(**lgbm_params, verbose=200)
#estimator_ctrl  = LGBMClassifier(**lgbm_params, verbose=200)

# treatment_model.fit(treatment_x,treatment_y)

uplift_models = TwoModels(
    estimator_trmnt=estimator_trmnt, 
    estimator_ctrl=estimator_ctrl
)

uplift_model_cl_tr = ClassTransformation(
    estimator=estimator_trmnt
)

In [16]:
## PREPARE DATA FOR XGB / LGBM MODEL, fit transform target_encodet on train data
enc = yandex_mean_encoder(
    columns=cat_cols, alpha=100
)

xgb_est_params = {
    'max_depth':2, 
    'learning_rate':0.5, 
    'n_estimators':20,
    'nthread':20,
    'n_gpus':0,
    'seed':10023
}

estimator = XGBClassifier(**xgb_est_params)


uplift_model_cl_tr = ClassTransformation(
    estimator=estimator
)

my_pipeline = Pipeline(steps=[
    ('enc', enc),
    ('est', uplift_model_cl_tr)
])

In [None]:
my_pipeline.fit(
    X=df_full_train[try_feat],
    y=df_full_train['target'],
    est__treatment=df_full_train['treatment_flg']
) 

In [144]:
pred_test = my_pipeline.predict(
    X=df_full_test[try_feat],
)

##### get  top feats from allready learned model by feature importance

In [14]:
with open('models/uplift_my_and_baseline_feats_two_models_catb.dill', 'rb') as f_in:
    model = dill.load(f_in)
    
feat_imp = catb_get_feature_imp(
    catb_est=model.estimator_trmnt,
    features_list=df_full_train\
        .drop(['treatment_flg', 'target'], axis=1)\
        .columns\
        .tolist(), 
    n_top=80
)

top_feats = feat_imp['feat'].values.tolist()

cat_top_feats = df_full_train[top_feats]\
    .select_dtypes(include='object')\
    .columns\
    .tolist()

#### set different try feats list

In [59]:
# try_feat = [f for f in  df_tr.columns if 'lats' not in f ]
# len(try_feat)

try_feat = list(
    set(top_feats) - set(cat_top_feats)
)

len(try_feat)

74

In [None]:
%%time
uplift_models.fit(
    X=df_tr.drop(['treatment_flg', 'target'], axis=1),
    y=df_tr['target'],
    treatment=df_tr['treatment_flg']   
)

#try_feat = [f for f in df_tr.columns if 'base' in f ]
try_feat = [f for f in  df_tr.columns if 'lats' not in f ]

uplift_models.fit(
    X=df_tr[try_feat],
    y=df_tr['target'],
    treatment=df_tr['treatment_flg']   
)

In [None]:
uplift_model_cl_tr.fit(
    X=df_full_train_mod.drop(['treatment_flg', 'target'], axis=1),
    y=df_full_train_mod['target'],
    treatment=df_full_train_mod['treatment_flg']   
)

In [None]:
uplift_model_xgb_cl_tr.fit(
     X=df_full_train[try_feat],
     y=df_full_train['target'],
     treatment=df_full_train['treatment_flg']   
 )

In [None]:
uplift_model_cl_tr.fit(
    X=df_full_train[try_feat],
    y=df_full_train['target'],
    treatment=df_full_train['treatment_flg']   
)

### predicts

In [20]:
upl_final_test = uplift_models.predict(df_full_test)

In [117]:
#upl_final_test = uplift_model_cl_tr.predict(df_full_test)
upl_final_test = uplift_model_xgb_cl_tr.predict(df_full_test[try_feat])

In [119]:
#df_submit = df_full_test.assign(uplift=upl_final_test)[['uplift']]
df_submit = df_full_test.assign(uplift=upl_final_test)[['uplift']]

print(df_submit.shape)
df_submit.head(2)

(200123, 1)


Unnamed: 0_level_0,uplift
client_id,Unnamed: 1_level_1
000048b7a6,0.351318
000073194a,0.424085


In [120]:
df_submit.to_csv('submissions/sub7_myrealfeats_xgb_d2_cl_tr.csv')

In [121]:
!head -2 'submissions/sub7_myrealfeats_xgb_d2_cl_tr.csv'

client_id,uplift
000048b7a6,0.35131788


In [73]:
upl_pred_cl_tr_tr = uplift_model_cl_tr.predict(df_tr[try_feat])
upl_pred_cl_tr_val = uplift_model_cl_tr.predict(df_val[try_feat])

In [42]:
upl_at_30_tr = np.round(uplift_at_k(
    y_true=df_tr['target'].values, 
    uplift=upl_pred_tr, 
    treatment=df_tr['treatment_flg'].values, 
    k=0.3
),5)

upl_at_30_val = np.round(uplift_at_k(
    y_true=df_val['target'].values, 
    uplift=upl_pred_val, 
    treatment=df_val['treatment_flg'].values, 
    k=0.3
),5)

print(f'Uplift@30% train: {upl_at_30_tr}')
print(f'Uplift@30% val  : {upl_at_30_val}')

# My feat 2 models catboost default cpu, 2 models
# Uplift@30% train: 0.45471
# Uplift@30% val  : 0.07812

# My feat + base feat 2 models catboost default cpu, 2 models
# Uplift@30% train: 0.45823
# Uplift@30% val  : 0.07113 

# Only base feats, 2 models catboost default cpu, 2 models
# Uplift@30% train: 0.35197
# Uplift@30% val  : 0.06207

#  Only base feats, 2 models lgbm 100 trees
# Uplift@30% train: 0.11573
# Uplift@30% val  : 0.07085

# Only base feats, 2 models lgbm 500 trees
# Uplift@30% train: 0.2005
# Uplift@30% val  : 0.06995

# CATB, 500 trees, depth=10
#Uplift@30% train: 0.86665
#Uplift@30% val  : 0.05907

# CATB, my feats 104 GPU baseline, PUBLIC: 0.071
#Uplift@30% train: 0.41384
#Uplift@30% val  : 0.0546/ 0.0547 / 0.07622

Uplift@30% train: 0.38502
Uplift@30% val  : 0.0737


In [74]:
upl_at_30_tr = np.round(uplift_at_k(
    y_true=df_tr['target'].values, 
    uplift=upl_pred_cl_tr_tr, 
    treatment=df_tr['treatment_flg'].values, 
    k=0.3
),5)

upl_at_30_val = np.round(uplift_at_k(
    y_true=df_val['target'].values, 
    uplift=upl_pred_cl_tr_val, 
    treatment=df_val['treatment_flg'].values, 
    k=0.3
),5)

print(f'Uplift@30% train: {upl_at_30_tr}')
print(f'Uplift@30% val  : {upl_at_30_val}')

# Uplift@30% train: 0.50008
# Uplift@30% val  : 0.08324

#Uplift@30% train: 0.17886
#Uplift@30% val  : 0.06671

Uplift@30% train: 0.17886
Uplift@30% val  : 0.06671


### MODEL VALIDATION

In [112]:
# XGB baseline

xgb_est_params = {
    'max_depth':2, 
    'learning_rate':0.5, 
    'n_estimators':20,
    'nthread':20,
    'n_gpus':0,
    'seed':10023
}

xgb_est_trmnt = XGBClassifier(**xgb_est_params)
xgb_est_cntr = XGBClassifier(**xgb_est_params)

uplift_model_xgb_cl_tr = ClassTransformation(
    estimator=xgb_est_trmnt
)

uplift_model_xgb_2mod = TwoModels(
    estimator_trmnt=xgb_est_trmnt,
    estimator_ctrl=xgb_est_cntr
)

In [97]:
### SET MODEL

cat_cols = df_full_train\
    .select_dtypes(include='object')\
    .columns.tolist()
    
cat_cols = list(set(cat_cols)\
                .intersection(set(try_feat)))

estimator_trmnt = CatBoostClassifier(
    cat_features=cat_cols,  verbose=200, random_state=42, task_type='GPU', devices='1'
)

estimator_ctrl = CatBoostClassifier(
    cat_features=cat_cols,  verbose=200, random_state=42, task_type='GPU', devices='1'
)

uplift_model_cl_tr = ClassTransformation(
    estimator=estimator_trmnt
)

uplift_model_2mod = TwoModels(
    estimator_trmnt=estimator_trmnt,
    estimator_ctrl=estimator_ctrl
)

In [None]:
val_cl_tr = make_validation(
    df_full=df_full_train,
    upift_model=uplift_model_xgb_cl_tr,
    try_feat=try_feat,
    n_iter=30
)

In [110]:
val_cl_tr_df =\
    pd.DataFrame(val_cl_tr).T

print(
    np.round(val_cl_tr_df['score_val'].mean(), 5), 
    np.round(val_cl_tr_df['score_val'].std(), 5)
)

0.07265 0.00687


In [None]:
val_res_all_feats_2mod = make_validation(
    df_full=df_full_train,
    upift_model=uplift_model_2mod,
    try_feat=try_feat,
    n_iter=30
)

##### SEARCH BEST PARAMS WITH ITERAIMVE VALIDATION

In [None]:
out_res = []

xgb_try_params = [
  (1, 10, 1),
  (2, 20, 0.5),
  (2, 100, 0.5),
  (2, 200, 0.4),
  (3, 100, 0.5)  
]

for param in xgb_try_params:
    d, n, eta = param[0], param[1], param[2]
    print(f'Parms: {param}')

    xgb_est_params = {
        'max_depth':d, 
        'learning_rate':eta, 
        'n_estimators':n,
        'nthread':50,
        'n_gpus':0,
        'seed':0
    }

    xgb_est = XGBClassifier(**xgb_est_params)

    uplift_tmp_model_xgb_cl_tr = ClassTransformation(
        estimator=xgb_est
    )
    
    val_tmp = make_validation(
        df_full=df_full_train,
        upift_model=uplift_tmp_model_xgb_cl_tr,
        try_feat=try_feat,
        n_iter=20
    )
    
    val_res_df =\
        pd.DataFrame(val_tmp).T
    
    mean = np.round(val_res_df['score_val'].mean(), 5)
    std = np.round(val_res_df['score_val'].std(), 5)
    print(f'Mean: {mean}, Std: {std}\n')
          
    out_res.append(
        [param, (mean, std)]
    )

In [84]:
out_res

[[(1, 10, 1), (0.05752, 0.00662)],
 [(2, 20, 0.5), (0.07135, 0.00708)],
 [(2, 100, 0.5), (0.06838, 0.00653)],
 [(2, 200, 0.4), (0.06838, 0.00662)],
 [(3, 100, 0.5), (0.06645, 0.00444)]]

In [64]:
val_res_all_feats_2mod_df = pd.DataFrame(val_res_all_feats_cl_tr).T
val_res_all_feats_cl_tr_df = pd.DataFrame(val_res_all_feats_2mod).T

In [65]:
# CLass trans winss
print(
    np.round(val_res_all_feats_cl_tr_df['score_val'].mean(), 5), 
    np.round(val_res_all_feats_cl_tr_df['score_val'].std(), 5)
)

# CL TRANSF ON CATB ON ALL 125 features workes better, and less std

0.06408 0.00567


In [None]:
# CLass trans winss
print(
    np.round(val_res_all_feats_2mod_df['score_val'].mean(), 5), 
    np.round(val_res_all_feats_2mod_df['score_val'].std(), 5)
)

print(
    np.round(val_res_all_feats_cl_tr_df['score_val'].mean(), 5), 
    np.round(val_res_all_feats_cl_tr_df['score_val'].std(), 5)
)
# 2 mod Catb gpu, all feats: 0.06592 / 0.0062
# Cl tr catb gpu: all_feats: 0.06999 / 0.00573

##### FEATURE IMPORTANCE

In [None]:
feat_imp = catb_get_feature_imp(
    catb_est=uplift_models.estimator_trmnt, 
    features_list=df_tr.drop(['treatment_flg', 'target'], axis=1).columns.tolist(), n_top=50
)

In [53]:
feat_imp

Unnamed: 0,feat,value
105,base_first_redeem_date,6.371143
1,cl_first_redeem_date,5.156744
55,most_comm_store_id,3.064702
47,most_comm_product_id,2.150233
79,most_comm_vendor_id,1.5788
81,most_comm_cnt_ratio_vendor_id,1.552867
83,min_date_trans,1.539354
84,max_date_trans,1.515702
67,most_comm_level_3,1.475879
71,most_comm_level_4,1.450214


In [36]:
# days_from_last_trans is it ok? 
feat_imp

Unnamed: 0,feat,value
1,cl_first_redeem_date,24.547691
87,lats_transaction_id__nunique,5.660717
52,nunique_transaction_id,3.879558
10,transaction_id__nunique,3.609283
84,max_date_trans,3.468024
53,most_comm_cnt_ratio_transaction_id,2.693755
85,days_from_last_trans,2.680469
83,min_date_trans,2.169165
2,cl_age,1.678595
0,cl_first_issue_date,1.56032


In [None]:
### TO DO 
## SELECT TOP FEATURES