In [1]:
import sys
import pandas as pd
import utils
import utils_bux
import featuretools as ft
from sklearn.externals import joblib

### DEFINE PIPELINE PARAMETERS

In [2]:
# Output switches: the final cells read these to decide whether the
# predictions are copied to Vertica and/or written to a local CSV file.
load_to_vertica = True
save_as_csv = False

# the timeframe of extracted users
# These two bounds must be defined in this cell: the print below and the
# entity-building cells read them, and with the assignments commented out a
# fresh kernel (Restart & Run All) fails with NameError. The defaults match
# the timeframe shown in the recorded output of this cell.
users_from = '2016-01-01'
users_till = '2016-01-07'
cohort_size = 10000000

# the timeframe of extracted behavioral data
interval = '6 days'

# the type of the prediction problem
# 'regression', 'binary classification', 'multiclass classification'
prediction_problem_type = 'multiclass classification'

# # multiclass values
# medium_value = 5
# high_value = 50

print("Pipeline parameters defined")
print("Extraction of scoring for users from", users_from, "till", users_till)

Pipeline parameters defined
Extraction of scoring for users from 2016-01-01 till 2016-01-07


### CONNECT TO THE DATABASE

In [3]:
# Open the Vertica session once; `cur` is used by the entity builders and
# `conn` by the final load step.
db_handles = utils.connect_to_db()
conn, cur = db_handles

Connected to the database


### BUILD ENTITIES

#### Cohorts entity

In [4]:
# Build the cohorts entity for the configured user timeframe.
user_window = {"users_from": users_from, "users_till": users_till}
cohorts = utils_bux.build_cohorts_entity(cur=cur, **user_window)

Cohorts entity built


#### Users entity

In [5]:
# Build the users entity from the cohorts, limited by the user timeframe,
# the behavioral-data interval and the cohort size set in the parameters cell.
user_details = utils_bux.build_users_entity(
    cur=cur,
    cohorts=cohorts,
    cohort_size=cohort_size,
    interval=interval,
    users_from=users_from,
    users_till=users_till,
)

Users entity built with 7891 users


#### Transactions entity

In [9]:
# Build the transactions entity over the configured behavioral-data interval.
daily_transactions = utils_bux.build_transactions_entity(
    interval=interval,
    cur=cur,
)

Transactions entity built with 55237 transactions


In [10]:
### no need for labels

### CREATE THE ENTITY SET

In [11]:
# Combine the three entities built above into one entity set and display it.
entity_frames = (cohorts, user_details, daily_transactions)
es = utils_bux.create_bux_entity_set(*entity_frames)
es

Entity set built


Entityset: bux_clv
  Entities:
    users (shape = [7891, 32])
    transactions (shape = [55237, 18])
  Relationships:
    transactions.user_id -> users.user_id

### FEATURE ENGINEERING (DFS)

In [12]:
# Load the saved feature definitions and compute them on the entity set.
top_features = ft.load_features("top_features", es)
fm = utils.calculate_feature_matrix_top_features(es, top_features)

# Model input: drop the feature-matrix index (rows become purely positional)
# and replace missing feature values with 0.
feature_names = list(fm.columns)
fm_positional = fm.reset_index(drop=True)
X = fm_positional.fillna(0)
print("Features built:\n", feature_names)

Features built:
 ['MEAN(transactions.trades_sb_invested_amount)', 'MAX(transactions.trades_sb_invested_amount)', 'SUM(transactions.trades_sb_invested_amount)', 'MAX(transactions.total_session_duration)', 'NUM_UNIQUE(transactions.trades_sb_commission)', 'MAX(transactions.trades_sb_long)', 'STD(transactions.trades_sb_invested_amount)', 'MEAN(transactions.trades_sb_open_positions)', 'SUM(transactions.total_session_duration)', 'STD(transactions.trades_sb_long)', 'SUM(transactions.view_position)', 'SUM(transactions.trades_sb_long)', 'STD(transactions.total_session_duration)', 'NUM_UNIQUE(transactions.financing_deposits_amount)', 'MEAN(transactions.total_session_duration)', 'MEAN(transactions.view_position)', 'MEAN(transactions.trades_sb_long)', 'STD(transactions.trades_sb_short)', 'SUM(transactions.trades_sb_open_positions)', 'MAX(transactions.view_position)', 'Position Closed_did_event', 'STD(transactions.view_position)', 'STD(transactions.trades_sb_open_positions)', 'MAX(transactions.trad

### LOADING THE MODEL

In [13]:
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model artifacts from a trusted location.
model_path = 'models/model.pkl'
model = joblib.load(model_path)
print("Model loaded")

Model loaded


### SCORING

In [28]:
# Score every row of the feature matrix with the loaded model.
y_pred = utils.rf_predict(
    model,                    # estimator loaded from models/model.pkl
    X,                        # feature matrix with missing values filled with 0
    prediction_problem_type,  # problem type from the pipeline parameters
)
print("Prediction done")

Prediction done


In [30]:
# Assemble the predictions table (one row per user) in the column order used
# for export; nothing is written to disk in this cell.
# NOTE(review): y_pred is attached to user_details["user_id"] as-is, which
# presumably relies on the feature matrix rows being in the same order as
# user_details — TODO confirm.
# NOTE(review): model_type is recorded as "xgboost" although scoring goes
# through utils.rf_predict — verify the label.
output_columns = ["topic_type", "report_date", "model_type", "user_id", "class_prediction", "prob"]
report_date = pd.to_datetime('today').strftime("%Y-%m-%d")

predictions = (
    user_details[["user_id"]]
    .assign(
        topic_type="clv_prediction",
        report_date=report_date,
        model_type="xgboost",
        class_prediction=y_pred,
        prob=0,  # constant placeholder; no probability is computed here
    )
    [output_columns]
)
predictions.head()

Unnamed: 0,topic_type,report_date,model_type,user_id,class_prediction,prob
0,clv_prediction,2018-05-25,xgboost,0032cdbe-5d18-4dfc-82d0-ab990e15d7af,0.0,0
1,clv_prediction,2018-05-25,xgboost,006f908c-6c61-437e-b026-2bb245fca3dd,0.0,0
2,clv_prediction,2018-05-25,xgboost,00f916da-5908-414a-99df-45e916209a52,0.0,0
3,clv_prediction,2018-05-25,xgboost,01745aa2-f52a-44b4-85fb-a54f5551358f,0.0,0
4,clv_prediction,2018-05-25,xgboost,0286d747-554a-475a-9737-c0aa9a33ec0e,0.0,0


### LOAD RESULTS INTO THE DATABASE

In [None]:
# Optional CSV export, controlled by `save_as_csv` from the parameters cell.
# NOTE(review): the target path has no ".csv" extension, while the disabled
# loader below globbed '*.csv' — confirm which file name is intended.
if save_as_csv:
    csv_path = "scoring/clv_prediction_" + users_from + "-" + users_till
    predictions.to_csv(csv_path, index=False)

# Disabled legacy snippet: re-read previously exported CSVs before loading
# them to Vertica (`os` and `dd` are not imported in this notebook).
# conn, cur = utils.connect_to_db()
# os.chdir('/home/jo/Documents/Master thesis @ BUX/notebooks/scoring')
# predictions = dd.read_csv('*.csv').compute()
# print("Number of users:", len(predictions))

In [None]:
# Optional database load, controlled by `load_to_vertica` from the
# parameters cell; reuses the connection opened at the top of the notebook.
if load_to_vertica:
    target_table = 'analytics.model_scoring_predictions'
    utils_bux.copy_to_vertica(predictions, target_table, conn)

In [None]:
# when running as a script
# NOTE(review): `main` is not defined or imported anywhere in the visible
# notebook, so the call below raises NameError unless it is provided
# elsewhere — TODO confirm, or wrap the pipeline cells in a
# main(users_from, users_till) function.
# NOTE(review): inside a Jupyter kernel __name__ is also "__main__", so this
# branch runs there too; sys.argv then typically holds the kernel launcher's
# arguments rather than dates — verify before running interactively.
if __name__ == "__main__":
    # Take the user timeframe bounds from the first two command-line arguments.
    users_from = sys.argv[1]
    users_till = sys.argv[2]
    main(users_from, users_till)