# Complete experiment to estimate the probability that an offer "succeeds" with a given customer. "Informational" offers are considered successful if the customer views them. All other kinds of offers are successful only if the customer views them and then completes them.

Table of contents <a id='top'></a>

1. [Get the data and show it](#data)
2. [Create the Model](#model)
3. [Evaluate the Model](#eval)
4. [Analysis and Conclusions](#conclusions)
5. [Test Results](#test)

In [1]:
import pandas as pd
import numpy as np
import math
import json
import os
import matplotlib.pyplot as plt
%matplotlib inline

# Reload imported modules automatically before each cell runs, so edits to
# the project's source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Notebook-wide plot appearance: jupyterthemes style plus a large default
# figure size for every matplotlib figure created below.
from jupyterthemes import jtplot
jtplot.style(theme='solarizedd')
plt.rcParams['figure.figsize'] = (20.0, 10.0)

import data_utils_mt.utils as utils
import seaborn as sns
from sklearn.pipeline import Pipeline
import datetime as dt
from xgboost import XGBClassifier

# Project locations, all expressed relative to this notebook's directory.
ROOT_DIR = '../..'
SRC = os.path.join(ROOT_DIR, 'src')
DATA_DIR = os.path.join(ROOT_DIR, 'data')

# One sub-directory of DATA_DIR per constant, in the order listed.
DATA_RAW, DATA_INTERIM, DATA_EXTERNAL, DATA_PROCESSED = (
    os.path.join(DATA_DIR, subdir)
    for subdir in ('raw', 'interim', 'external', 'processed')
)

# Pickled dataset that the data-loading cell reads.
STATIC_DATASET_PATH = os.path.join(DATA_INTERIM, 'static_data.pkl')

# Put the project's source directory on the module search path.
# NOTE(review): the imports that follow use the `src.` package prefix, which
# resolves from the directory that contains `src`, not from SRC itself --
# confirm whether this append is what actually makes them importable.
import sys
sys.path.append(SRC)

import src.data.preprocessing as pp
import src.data.success_dataset as sd
import src.data.missing_data as md
import src.evaluation.offer_success as evos

## 1. Get the data and show it <a id='data'></a>
[Top](#top)

In [2]:
# Get the data
# Returns the train+validation and test partitions (features and labels) and
# an encoder object. drop_time=False is passed here and the time-dependent
# columns are removed afterwards -- presumably so that pp.time_split can still
# split on time; confirm in src.data.success_dataset.
X_train_val, X_test, y_train_val, y_test, encoder = sd.get_success_data(
    basic_dataset_path=STATIC_DATASET_PATH,
    drop_time=False)

# Time-split validation datasets
# The test features get their time-dependent columns dropped explicitly; the
# train/validation partition is divided by pp.time_split instead.
# NOTE(review): the meaning and units of time_limit=370 are not visible in
# this notebook -- TODO confirm and consider a named constant.
X_test = sd.drop_time_dependent(X_test)
X_train, X_val, y_train, y_val = pp.time_split(X_train_val, 
                                               y_train_val,
                                               time_limit=370)

In [3]:
# Training split: feature and label dimensions, then the first rows.
print(X_train.shape, y_train.shape, sep='\n')
X_train.head()

(38030, 13)
(38030,)


Unnamed: 0,age,gender,income,missing_demographics,member_epoch_days,difficulty,duration,offer_type,reward_t,channel_web,channel_email,channel_social,channel_mobile
0,33.0,M,72000.0,0,17277,0.0,3.0,informational,0.0,0.0,1.0,1.0,1.0
1,33.0,M,72000.0,0,17277,0.0,4.0,informational,0.0,1.0,1.0,0.0,1.0
5,,,,1,17646,5.0,5.0,bogo,5.0,1.0,1.0,1.0,1.0
7,40.0,O,57000.0,0,17540,0.0,4.0,informational,0.0,1.0,1.0,0.0,1.0
8,40.0,O,57000.0,0,17540,7.0,7.0,discount,3.0,1.0,1.0,1.0,1.0


In [4]:
# Validation split: feature and label dimensions, then the first rows.
print(X_val.shape, y_val.shape, sep='\n')
X_val.head()

(12778, 13)
(12778,)


Unnamed: 0,age,gender,income,missing_demographics,member_epoch_days,difficulty,duration,offer_type,reward_t,channel_web,channel_email,channel_social,channel_mobile
2,33.0,M,72000.0,0,17277,5.0,5.0,bogo,5.0,1.0,1.0,1.0,1.0
10,40.0,O,57000.0,0,17540,20.0,10.0,discount,5.0,1.0,1.0,0.0,0.0
15,59.0,F,90000.0,0,16864,10.0,5.0,bogo,10.0,1.0,1.0,1.0,1.0
19,24.0,F,60000.0,0,17116,0.0,3.0,informational,0.0,0.0,1.0,1.0,1.0
24,26.0,F,73000.0,0,17338,10.0,10.0,discount,2.0,1.0,1.0,1.0,1.0


In [5]:
# Test split: feature and label dimensions, then the first rows.
print(X_test.shape, y_test.shape, sep='\n')
X_test.head()

(25469, 13)
(25469,)


Unnamed: 0,age,gender,income,missing_demographics,member_epoch_days,difficulty,duration,offer_type,reward_t,channel_web,channel_email,channel_social,channel_mobile
3,33.0,M,72000.0,0,17277,10.0,10.0,discount,2.0,1.0,1.0,1.0,1.0
4,33.0,M,72000.0,0,17277,10.0,7.0,discount,2.0,1.0,1.0,0.0,1.0
6,,,,1,17646,5.0,5.0,bogo,5.0,1.0,1.0,1.0,1.0
11,40.0,O,57000.0,0,17540,5.0,7.0,bogo,5.0,1.0,1.0,0.0,1.0
16,59.0,F,90000.0,0,16864,0.0,3.0,informational,0.0,0.0,1.0,1.0,1.0


## 2. Create the Model <a id='model'></a>
[Top](#top)

In [6]:
from sklearn.model_selection import GridSearchCV

In [7]:
# Baseline pipeline: encode, impute, then classify with gradient-boosted
# trees. The step names are what the grid-search parameter keys refer to.
base_model = Pipeline(steps=[
    ('encoder', pp.BasicEncoder()),
    ('imputer', md.BasicImputer()),
    ('estimator', XGBClassifier(
        n_estimators=200,
        max_depth=7,
        random_state=2018,  # fixed seed so reruns are comparable
        n_jobs=-1,
    )),
])

In [8]:
# Grid search for better parameters

parameters = {
    'estimator__max_depth': [4, 7],
    'estimator__n_estimators': [10, 200, 500],
    'estimator__subsample': [0.5, 1.0],
    'estimator__colsample_bytree': [0.5, 0.7, 1.0],
    'estimator__colsample_bylevel': [0.5, 0.7, 1.0]
}
cv = GridSearchCV(base_model, parameters, cv=3, n_jobs=-1)

%time cv.fit(X_train, y_train)

CPU times: user 10.2 s, sys: 285 ms, total: 10.5 s
Wall time: 10min 32s


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('encoder', BasicEncoder()), ('imputer', BasicImputer()), ('estimator', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, mi...=2018, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'estimator__max_depth': [4, 7], 'estimator__n_estimators': [10, 200, 500], 'estimator__subsample': [0.5, 1.0], 'estimator__colsample_bytree': [0.5, 0.7, 1.0], 'estimator__colsample_bylevel': [0.5, 0.7, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [9]:
# Parameter combination that obtained the best cross-validated score.
cv.best_params_

{'estimator__colsample_bylevel': 1.0,
 'estimator__colsample_bytree': 0.7,
 'estimator__max_depth': 4,
 'estimator__n_estimators': 200,
 'estimator__subsample': 1.0}

In [10]:
# Keep the winning pipeline for the evaluation cells below. The grid search
# ran with refit=True (see its repr above), so this estimator has already
# been refit on the data passed to cv.fit.
model = cv.best_estimator_
model.get_params()

{'memory': None,
 'steps': [('encoder', BasicEncoder()),
  ('imputer', BasicImputer()),
  ('estimator',
   XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
          colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
          max_depth=4, min_child_weight=1, missing=None, n_estimators=200,
          n_jobs=-1, nthread=None, objective='binary:logistic',
          random_state=2018, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
          seed=None, silent=True, subsample=1.0))],
 'encoder': BasicEncoder(),
 'imputer': BasicImputer(),
 'estimator': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
        colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=4, min_child_weight=1, missing=None, n_estimators=200,
        n_jobs=-1, nthread=None, objective='binary:logistic',
        random_state=2018, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=None, silent=True, subsample=1.0),
 'e

## 3. Evaluate the Model <a id='eval'></a>
[Top](#top)

### Time-split Validation

In [11]:
# Train and score the tuned pipeline with the time-based validation split.
# Only the model is passed in, so the helper must obtain its data itself
# (presumably the same split as built above -- confirm in
# src.evaluation.offer_success).
# NOTE: in the recorded output the section titled "TEST RESULTS" covers
# 12778 rows, the size of X_val, so it describes the validation split and
# not the held-out test set.
trained_model, y_train_pred, y_val_pred = evos.time_split_validation(model)

Training time: 1.590562105178833 seconds.
--------------------------------------------TRAIN RESULTS--------------------------------------------
Confusion Matrix:
[[15222  4910]
 [ 5327 12571]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.76      0.75     20132
           1       0.72      0.70      0.71     17898

   micro avg       0.73      0.73      0.73     38030
   macro avg       0.73      0.73      0.73     38030
weighted avg       0.73      0.73      0.73     38030

----------------------------------------------------------------------------------------------------
--------------------------------------------TEST RESULTS--------------------------------------------
Confusion Matrix:
[[4960 1783]
 [2017 4018]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.74      0.72      6743
           1       0.69      0.67      0.68      6035

   micro avg       0.70

### Customer-split validation

In [12]:
# Train and score the same pipeline with a customer-based split instead of a
# time-based one. Presumably customers are randomly assigned to either the
# training or the validation side so that none appears in both -- confirm in
# src.evaluation.offer_success.
evos.random_1fold_cust_validation(model)

Training time: 1.63970947265625 seconds.
--------------------------------------------TRAIN RESULTS--------------------------------------------
Confusion Matrix:
[[14256  4612]
 [ 5073 11616]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.76      0.75     18868
           1       0.72      0.70      0.71     16689

   micro avg       0.73      0.73      0.73     35557
   macro avg       0.73      0.73      0.73     35557
weighted avg       0.73      0.73      0.73     35557

----------------------------------------------------------------------------------------------------
--------------------------------------------TEST RESULTS--------------------------------------------
Confusion Matrix:
[[5849 2158]
 [2177 5067]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.73      0.73      8007
           1       0.70      0.70      0.70      7244

   micro avg       0.72 

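## 4. Analysis and Conclusions <a id='conclusions'></a>
[Top](#top)

- The grid search selected `max_depth=4`, `n_estimators=200`, `colsample_bytree=0.7`, `colsample_bylevel=1.0` and `subsample=1.0`.
- With these parameters the model reaches an accuracy of about 0.73 on the training data, 0.70 on the time-split validation set and 0.72 on the customer-split validation set, so the gap between training and validation scores is small.
- **TODO**: expand this analysis.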

## 5. Test Results (only run this once, after adjusting all the hyperparameters) <a id='test'></a>
[Top](#top)

In [13]:
# Final evaluation on the held-out test set; run only after tuning is done.
# In the recorded output the model is trained on 50808 rows (38030 train +
# 12778 validation) and scored on 25469 rows, the size of X_test.
evos.offer_success_test(model)

Training time: 3.88137149810791 seconds.
--------------------------------------------TRAIN RESULTS--------------------------------------------
Confusion Matrix:
[[20054  6821]
 [ 7194 16739]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.75      0.74     26875
           1       0.71      0.70      0.70     23933

   micro avg       0.72      0.72      0.72     50808
   macro avg       0.72      0.72      0.72     50808
weighted avg       0.72      0.72      0.72     50808

----------------------------------------------------------------------------------------------------
--------------------------------------------TEST RESULTS--------------------------------------------
Confusion Matrix:
[[10118  3870]
 [ 3508  7973]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.72      0.73     13988
           1       0.67      0.69      0.68     11481

   micro avg       0