In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For adjusting 'scales' of train, test1 and test2
from sklearn.preprocessing import RobustScaler
# For hyperparameter optimization
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
# For amount_spent modeling
import lightgbm as lgb
# For survival_time modeling
import xgboost as xgb
# For screening survival, non-spent users modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import RobustScaler

# Customizing functions
from MJ_functions import merge_dummy_data, categorical_to_int, return_final_data, return_true_label, return_multi_pred_label
from MJ_functions import hypertuning_rscv, plotImp_multioutput, score_function_2, find_best_threshold

In [None]:
import lime
import lime.lime_tabular

In [None]:
import warnings
# Suppress every warning for the rest of the session; note this also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

# (1) Import Data and Select Features

In [None]:
def _read_sorted_by_acc_id(csv_path):
    """Load a preprocessed CSV, order its rows by acc_id and renumber the index from 0."""
    frame = pd.read_csv(csv_path).copy()
    return frame.sort_values('acc_id').reset_index(drop=True)


# Train data(40000, 997)
merged_data = _read_sorted_by_acc_id("train_preprocess_1.csv")
# Test1 data(40000, 995)
test_1 = _read_sorted_by_acc_id("test1_preprocess_1.csv")
# Test2 data(40000, 995)
test_2 = _read_sorted_by_acc_id("test2_preprocess_1.csv")

In [None]:
# Column roles used throughout the notebook.
user_feature = ['acc_id']  # User id
label_feature = ['survival_time', 'amount_spent', 'survival_yn', 'amount_yn']  # Labels for profit modeling
#category = [i for i in merged_data.columns.values if ('common_item_sell'in i)|('common_item_buy'in i)|('sell_time'in i)|('buy_time'in i)|('sell_type'in i)|('buy_type'in i)|('last_sell_item_type'in i)|('last_buy_item_type'in i)]

# A column is dropped when its name contains any of these day markers;
# 'combat_days' is always dropped.
_excluded_day_tokens = (
    'day_1_', 'day_4_', 'day_8_', 'day_17', 'day_20',
    'day_21', 'day_22', 'day_23', 'day_24', 'day_25',
)
remove_features = ['combat_days'] + [
    column
    for column in merged_data.columns.values
    if any(token in column for token in _excluded_day_tokens)
]

# Model inputs: every column that is not an id, a label or an excluded column, in sorted order.
_non_feature_columns = set(user_feature + label_feature + remove_features)
features = sorted(set(merged_data.columns) - _non_feature_columns)
#scale_features = sorted(list(set(features)-set(category)))

In [None]:
# Rebuild the sorted list of model inputs (all columns minus id, label and removed columns).
_excluded_columns = set(user_feature) | set(label_feature) | set(remove_features)
features = sorted(column for column in set(merged_data.columns) if column not in _excluded_columns)

# day_features: names containing 'day_' but not '_day_'; wo_day_features: everything else, sorted.
day_features = [name for name in features if ('day_' in name) and not ('_day_' in name)]
wo_day_features = sorted(set(features).difference(day_features))

In [None]:
# Binary targets derived from the raw labels.
# survival_yn is 1 exactly when survival_time equals 64, otherwise 0
# (64 is presumably the length of the observation window - TODO confirm).
merged_data['survival_yn'] = (merged_data['survival_time'] == 64).astype(int)
# amount_yn is 0 exactly when amount_spent equals 0, otherwise 1.
merged_data['amount_yn'] = (merged_data['amount_spent'] != 0).astype(int)

In [None]:
# Show the class balance of survival_yn, first as raw counts and then as fractions.
_survival_flag = merged_data['survival_yn']
display(_survival_flag.value_counts(), _survival_flag.value_counts(normalize=True))

# (2) Robust Scaling and Train Test Split

In [None]:
# Switch for the optional robust-scaling step below, which runs only when
# `scaling == True` holds (i.e. for 1 / True); 0 leaves the features unscaled.
scaling = 0

In [None]:
# Optional step: fit one RobustScaler on train + test1 + test2 stacked together,
# then rescale the feature columns of each frame with that single transformer.
# The `== True` comparison is kept on purpose: only 1 / True enables the step.
if scaling == True:
    all_data = pd.concat([merged_data, test_1, test_2], sort=True).reset_index(drop=True)
    transformer = RobustScaler().fit(all_data[features])

    for frame in (merged_data, test_1, test_2):
        frame[features] = transformer.transform(frame[features])

In [None]:
# Fold membership files; only their acc_id column is used to split merged_data.
# NOTE(review): the variables are named *_fold1 but the files read are *_fold2.csv - confirm this is intended.
train_fold1, test_fold1 = (pd.read_csv(path) for path in ('train_fold2.csv', 'test_fold2.csv'))

# acc_id values of each side of the split, re-indexed from 0.
train_fold1_acc_id = train_fold1['acc_id'].reset_index(drop=True)
test_fold1_acc_id = test_fold1['acc_id'].reset_index(drop=True)

In [None]:
# Split merged_data by membership of acc_id in the train / test fold id lists.
_in_train_fold = merged_data['acc_id'].isin(train_fold1_acc_id)
_in_test_fold = merged_data['acc_id'].isin(test_fold1_acc_id)

merged_train = merged_data[_in_train_fold]
merged_test = merged_data[_in_test_fold]

# (3) Binary Classification

In [None]:
# Inputs and the two binary targets (survival_yn, amount_yn) for the screening classifier.
X_train_yn, y_train_yn = merged_train[wo_day_features], merged_train[['survival_yn', 'amount_yn']]
X_test_yn, y_test_yn = merged_test[wo_day_features], merged_test[['survival_yn', 'amount_yn']]

# One random forest per target, wrapped so both are fitted and queried together.
multi_rf_clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100,
                                                            random_state=0,
                                                            verbose=3, n_jobs=-1)).fit(X_train_yn, y_train_yn)
true_label_yn = return_true_label(y_test_yn, merged_test)
pred_label_yn = return_multi_pred_label(multi_rf_clf, true_label_yn, X_test_yn)

# Score the test fold once; element 0 holds the survival_yn probabilities and
# element 1 the amount_yn probabilities (same order as the target columns).
test_proba_yn = multi_rf_clf.predict_proba(X_test_yn)

# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
# NOTE(review): concat aligns on the index, so pred_label_yn[1] is assumed to
# carry a 0..n-1 index like the probability frames - confirm in MJ_functions.
sur_pred_res = pd.concat([pd.DataFrame(test_proba_yn[0]), pred_label_yn[1][['pred_survival_time', 'survival_time']]], axis=1)
sur_pred_res.columns = ['survival_yn_prob_0', 'survival_yn_prob_1', 'pred_survival_yn', 'survival_yn']
ams_pred_res = pd.concat([pd.DataFrame(test_proba_yn[1]), pred_label_yn[1][['pred_amount_spent', 'amount_spent']]], axis=1)
ams_pred_res.columns = ['amount_yn_prob_0', 'amount_yn_prob_1', 'pred_amount_yn', 'amount_yn']

## (a) Survival Time

In [None]:
# Plain numpy views of the feature matrices for LIME.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
train_array = merged_train[wo_day_features].values
test_array = merged_test[wo_day_features].values

# Rank the test rows by ascending P(class 0) of the survival classifier and pick
# positions 800-809 and 7190-7199 of that ranking as the rows to explain.
_sur_proba_sorted = pd.DataFrame(multi_rf_clf.estimators_[0].predict_proba(X_test_yn)).sort_values(0)
rf_lst = list(_sur_proba_sorted.index[800:810]) + list(_sur_proba_sorted.index[7190:7200])

# The explainer depends only on the training matrix, so it is built once.
explainer_sur_rf = lime.lime_tabular.LimeTabularExplainer(train_array, feature_names=wo_day_features, class_names=[0,1])
# True labels re-indexed from 0 so they can be looked up by the positions in rf_lst.
_survival_yn_true = y_test_yn['survival_yn'].reset_index(drop=True)

for i in rf_lst:
    exp_rf_sur = explainer_sur_rf.explain_instance(test_array[i], multi_rf_clf.estimators_[0].predict_proba, num_features = 5)
    exp_rf_sur.show_in_notebook()
    # True survival_yn label of the explained row, then its top feature contributions.
    print(_survival_yn_true[i])
    for k in exp_rf_sur.as_list():
        print(k)
        

## (b) amount spent

In [None]:
# Plain numpy views of the feature matrices for LIME.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
train_array = merged_train[wo_day_features].values
test_array = merged_test[wo_day_features].values

# Rank the test rows by ascending P(class 0) of the amount classifier and pick
# positions 800-809 and 7190-7199 of that ranking as the rows to explain.
_ams_proba_sorted = pd.DataFrame(multi_rf_clf.estimators_[1].predict_proba(X_test_yn)).sort_values(0)
rf_lst = list(_ams_proba_sorted.index[800:810]) + list(_ams_proba_sorted.index[7190:7200])

# The explainer depends only on the training matrix, so it is built once.
# (Variable names are kept from the survival cell so later references still resolve.)
explainer_sur_rf = lime.lime_tabular.LimeTabularExplainer(train_array, feature_names=wo_day_features, class_names=[0,1])
# True labels re-indexed from 0 so they can be looked up by the positions in rf_lst.
_amount_yn_true = y_test_yn['amount_yn'].reset_index(drop=True)

for i in rf_lst:
    exp_rf_sur = explainer_sur_rf.explain_instance(test_array[i], multi_rf_clf.estimators_[1].predict_proba, num_features = 5)
    exp_rf_sur.show_in_notebook()
    # True amount_yn label of the explained row, then its top feature contributions.
    print(_amount_yn_true[i])
    for k in exp_rf_sur.as_list():
        print(k)

# (4) Regression

## (a) amount spent

In [None]:
# Tuned LightGBM settings for the amount_spent regressor.
# NOTE(review): the key set looks like an LGBMRegressor.get_params() dump; when this
# dict is handed to lgb.train, 'n_estimators' may override the num_boost_round
# argument, and 'num_leaves' (999) exceeds what 'max_depth' (5) can reach - confirm.
opt_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=0.37758858328507827,
    importance_type='split',
    learning_rate=0.01,
    max_depth=5,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=1000,
    n_jobs=-1,
    num_leaves=999,
    objective='rmse',
    random_state=42,
    reg_alpha=9.011935682890176,
    reg_lambda=1.6839355842740356,
    silent=True,
    subsample=0.7436506067797896,
    subsample_for_bin=200000,
    subsample_freq=0,
)

In [None]:
# Features / target for the amount_spent regressor.
X_train = merged_train[wo_day_features]
y_train = merged_train['amount_spent']
X_test = merged_test[wo_day_features]
y_test = merged_test['amount_spent']

# LightGBM datasets; the test fold doubles as the validation set for early stopping.
lgbtrain = lgb.Dataset(X_train, label=y_train)
lgbval = lgb.Dataset(X_test, label=y_test)

# Upper bound on boosting rounds and early-stopping patience.
num_rounds = 10000
early_stopping_rounds = 100

LGBM_opt_model = lgb.train(
    opt_params,
    lgbtrain,
    num_boost_round=num_rounds,
    valid_sets=lgbval,
    early_stopping_rounds=early_stopping_rounds,
)

In [None]:
# Rank the test rows by descending predicted amount_spent (predicted once) and pick
# positions 800-809 and 7190-7199 of that ranking as the rows to explain.
# (`surv_reg` keeps its original name although it indexes amount_spent predictions.)
_amount_pred_sorted = pd.DataFrame(LGBM_opt_model.predict(X_test)).sort_values(0, ascending=False)
surv_reg = list(_amount_pred_sorted.index[800:810]) + list(_amount_pred_sorted.index[7190:7200])

explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                   feature_names=wo_day_features, class_names=['amount_spent'], verbose=True, mode='regression')

# Hand LIME a plain numpy row (as the classifier cells do) rather than a pandas
# Series, whose integer indexing inside LIME depends on pandas' positional fallback.
_X_test_rows = X_test.values
for i in surv_reg:
    exp = explainer.explain_instance(_X_test_rows[i], LGBM_opt_model.predict, num_features=7)
    exp.show_in_notebook(show_table=True)
    for k in exp.as_list():
        print(k)

## (b) survival time

In [None]:
# Features / target for the survival_time regressor.
X_train, y_train = merged_train[wo_day_features], merged_train['survival_time']
X_test, y_test = merged_test[wo_day_features], merged_test['survival_time']

# Tuned XGBoost settings (Poisson count objective).
xgb_opt_params = {'colsample_bytree': 0.4281763598169799,
 'gamma': 30.0,
 'max_depth': 453,
 'min_child_weight': 15.229362467690951,
 'subsample': 0.7,
 'eta': 0.1,
 'objective': 'count:poisson'}

# Create the train and validation DMatrix objects once; the original cell
# repeated this whole section verbatim and so built both matrices twice.
xgtrain = xgb.DMatrix(X_train, label=y_train, feature_names=wo_day_features)
xgval = xgb.DMatrix(X_test, label=y_test, feature_names=wo_day_features)

# Datasets whose metric is reported every round; 'val' is the test fold.
watchlist = [(xgtrain, 'train'), (xgval, 'val')]

# Upper bound on boosting rounds; training stops earlier after 100 rounds without improvement.
num_rounds = 10000

rgrs = xgb.train(xgb_opt_params, xgtrain, num_rounds, watchlist, early_stopping_rounds=100)

In [None]:
# Rank the validation rows by ascending predicted survival_time and keep
# positions 800-809 and 7190-7199 of that ranking as the rows to explain.
_survival_pred_sorted = pd.DataFrame(rgrs.predict(xgval)).sort_values(0)
xglst = [*_survival_pred_sorted.index[800:810], *_survival_pred_sorted.index[7190:7200]]

In [None]:
def wrapped_predict(data_x):
    '''
    Wrap the xgboost predict function in order to make it lime-friendly.

    - the model (`rgrs`) and the feature names (`wo_day_features`) are defined outside

    Parameters
    ----------
    data_x : 2-D array of rows to score; its columns are labelled with
        wo_day_features, so they are assumed to follow that order - confirm.

    Returns
    -------
    The array returned by rgrs.predict for the rows of data_x.
    '''
    # Prediction does not read labels, so the DMatrix is built from features only.
    tmp_data = xgb.DMatrix(data_x, feature_names=wo_day_features)
    return rgrs.predict(tmp_data)

In [None]:
# Regression-mode LIME explainer fitted on the training feature matrix.
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                   feature_names=wo_day_features, class_names=['survival_time'], verbose=True, mode='regression')

# Hand LIME a plain numpy row (as the classifier cells do) rather than a pandas
# Series, whose integer indexing inside LIME depends on pandas' positional fallback.
_X_test_rows = X_test.values
for i in xglst:
    exp = explainer.explain_instance(_X_test_rows[i], wrapped_predict, num_features=10)
    exp.show_in_notebook(show_table=True)
    for k in exp.as_list():
        print(k)