In [1]:
import pandas as pd
import utils
import utils_bux
import featuretools as ft
from sklearn.externals import joblib



### DEFINE THE PIPELINE PARAMETERS

In [5]:
show_report = True
save_model = False

# the timeframe of extracted users
users_from = '2016-10-01'
users_till = '2016-11-01'
# users_till = '2017-09-30'
cohort_size = 2000

# the timeframe of extracted behavioral data
interval = '1 week'

# the type of the prediction problem
# 'regression', 'binary classification', 'multiclass classification'
prediction_problem_type = 'multiclass classification'

# multiclass values
medium_value = 5
high_value = 50

# number of the most important features to extract
number_of_features = 20

print("Pipeline parameters defined")

Pipeline parameters defined


### CONNECT TO THE DATABASE

In [6]:
# connect to the vertica database, create a cursor
cur = utils.connect_to_db()
print("Connected to the database")

Connected to the database


### BUILD ENTITY SET AND LABELS

#### Cohorts entity

In [7]:
cohorts = utils_bux.build_cohorts_entity(cur=cur,
                                         users_from=users_from,
                                         users_till=users_till)
print("Cohorts entity built")

Cohorts entity built


#### Users entity

In [8]:
user_details = utils_bux.build_users_entity(cur=cur,
                                            users_from=users_from,
                                            users_till=users_till,
                                            interval=interval,
                                            cohorts=cohorts,
                                            cohort_size=cohort_size)
print("Users entity built")

Users entity built


#### Transactions entity

In [10]:
daily_transactions = utils_bux.build_transactions_entity(cur=cur,
                                                         interval=interval)
print("Transactions entity built")

Transactions entity built


#### Labels

In [11]:
labels = utils_bux.build_target_values(cur=cur,
                                       medium_value=medium_value,
                                       high_value=high_value)
print("Target values built")

Target values built


### CREATE THE ENTITY SET

In [12]:
# entities
# cohorts = pd.read_csv("data/cohorts.csv")
# user_details = pd.read_csv("data/users_1y_6mCustomerValue_2000_3w.csv")
# daily_transactions = pd.read_csv('data/cube_1y_6mCustomerValue_2000_3w.csv')

# target values
# labels = pd.read_csv('data/curcv_1y_6mCustomerValue_2000_3w.csv')

In [13]:
# problem with the fillna (initial deposit lim and days to initial deposit)
es = utils_bux.create_bux_entity_set(cohorts, user_details, daily_transactions)
es

Entityset: bux_clv
  Entities:
    transactions (shape = [28144, 18])
    cohorts (shape = [5, 11])
    users (shape = [3354, 37])
  Relationships:
    users.cohort_id -> cohorts.cohort_id
    transactions.user_id -> users.user_id

### FEATURE ENGINEERING (DFS)

In [14]:
from featuretools.primitives import (Sum, Std, Max, Min, Mean,
                                 Count, PercentTrue, NUnique, 
                                 Day, Week, Month, Weekday, Weekend)


trans_primitives = [Day, Week, Month, Weekday, Weekend]
agg_primitives = [Sum, Std, Max, Min, Mean, Count, PercentTrue, NUnique]



fm_encoded, features_encoded = utils.calculate_feature_matrix_unparallel(es,
                                                                         "users",
                                                                         trans_primitives=trans_primitives,
                                                                         agg_primitives=agg_primitives,
                                                                         max_depth=2)
X = fm_encoded.reset_index().merge(labels)

Building features: 527it [00:00, 1731.49it/s]
Progress: 100%|██████████| 1/1 [01:04<00:00, 64.61s/cutoff time]
303 features generated


### TRAINING AND PREDICTION

In [15]:
# define the labels based on the prediction problem type
X, y = utils.make_labels(X, prediction_problem_type)
# split the data into training and testing
X_train, X_test, y_train, y_test = utils.train_test_splitting(X, y)
# fit the model
model = utils.xgboost_train(X_train, y_train, prediction_problem_type)
# predict on the testing set
y_pred = utils.xgboost_predict(model, X_test, prediction_problem_type)
# extract the most important features
top_features = utils.feature_importances(model, features_encoded, n=number_of_features)
# save the top features
ft.save_features(top_features, "top_features")
print("Features built and the most important features saved")

1: Feature: <Feature: MONTH(ams_first_funded_dts) = unknown>, 0.079
2: Feature: <Feature: SUM(transactions.view_position)>, 0.060
3: Feature: <Feature: STD(transactions.trades_sb_invested_amount)>, 0.056
4: Feature: <Feature: Banner Clicked_hours_till_event>, 0.043
5: Feature: <Feature: network = Organic>, 0.039
6: Feature: <Feature: Position Opened_hours_till_event>, 0.036
7: Feature: <Feature: MAX(transactions.trades_sb_short)>, 0.032
8: Feature: <Feature: COUNT(transactions)>, 0.030
9: Feature: <Feature: News Item Opened_hours_till_event>, 0.030
10: Feature: <Feature: MAX(transactions.total_session_duration)>, 0.030
11: Feature: <Feature: DAY(ams_first_funded_dts) = unknown>, 0.029
12: Feature: <Feature: PERCENT_TRUE(transactions.IS_WEEKEND(date))>, 0.026
13: Feature: <Feature: WEEKDAY(bux_account_created_dts) = 0>, 0.025
14: Feature: <Feature: MEAN(transactions.view_position)>, 0.024
15: Feature: <Feature: MONTH(ams_first_funded_dts) = 11.0>, 0.021
16: Feature: <Feature: MEAN(trans

### SAVE THE MODEL

In [16]:
if save_model == True:
    joblib.dump(model, 'models/model.pkl')
print("Model saved")

Model saved


### REPORT

In [17]:
if show_report == True:
    # execute the report
    print("Report shown")

Report shown


In [18]:
y_pred_round_xgb = [1 if value > 0.5 else 0 for value in y_pred]

In [None]:
from sklearn import metrics

def plot_roc_curve(y_test, y_pred):
    auc = metrics.roc_auc_score(y_test, y_pred)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)

    plt.plot(fpr, tpr)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.rcParams['font.size'] = 12
    plt.title('ROC curve, AUC: ' + str(auc))
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.grid(True)
    
plot_roc_curve(y_test, y_pred)

In [None]:
cm = confusion_matrix(y_test, y_pred.round(0))
utils.plot_confusion_matrix(cm, ['Non-whale', 'Whale'], title='Customer lifetime value prediction (Confusion matrix)')

In [None]:
scores = cross_val_score(model, X, y, cv=5, scoring='f1')
print("F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#### LIME

In [None]:
import lime
import lime.lime_tabular