In [10]:
import sys
import pandas as pd
import utils
import utils_bux
import featuretools as ft
from sklearn.externals import joblib

### DEFINE PIPELINE PARAMETERS

In [11]:
# Output switches: with both off, the save/load cells at the end are skipped.
load_to_database = False
save_as_csv = False

# Start and end of the timeframe from which users are extracted.
users_from = '2018-04-01'
users_till = '2018-04-30'

# Cap on users per cohort; one billion effectively means "include everyone".
cohort_size = 10 ** 9

# Length of the window of behavioral data that is extracted.
interval = '3 weeks'

# Kind of prediction problem, one of:
# 'regression', 'binary classification', 'multiclass classification'
prediction_problem_type = 'binary classification'

print("Pipeline parameters defined")
print("Extraction of scoring for users from", users_from, "till", users_till)

Pipeline parameters defined
Extraction of scoring for users from 2018-04-01 till 2018-04-30


### CONNECT TO THE DATABASE

In [12]:
# Connect to the database. The helper's result is unpacked into `conn` and
# `cur` (presumably a connection and a cursor -- confirm in utils); `cur` is
# handed to the entity builders and `conn` to the database load step.
conn, cur = utils.connect_to_db()

Connected to the database


### BUILD ENTITIES

#### Cohorts entity

In [13]:
# Build the cohorts entity for the configured user timeframe.
cohorts = utils_bux.build_cohorts_entity(
    cur=cur,
    users_from=users_from,
    users_till=users_till,
)

Cohorts entity built


#### Users entity

In [14]:
# Build the users entity for the same timeframe, passing along the behavioral
# data interval, the cohorts built earlier and the per-cohort size cap.
user_details = utils_bux.build_users_entity(
    cur=cur,
    users_from=users_from,
    users_till=users_till,
    interval=interval,
    cohorts=cohorts,
    cohort_size=cohort_size,
)

Users entity built with 36523 users


#### Transactions entity

In [15]:
# Build the transactions entity over the configured behavioral data interval.
daily_transactions = utils_bux.build_transactions_entity(
    cur=cur,
    interval=interval,
)

Transactions entity built with 803506 transactions


### CREATE THE ENTITY SET

In [16]:
# Combine the three entities into one entity set and display its summary.
es = utils_bux.create_bux_entity_set(
    cohorts,
    user_details,
    daily_transactions,
)
es

Entity set built


Entityset: bux_clv
  Entities:
    users (shape = [36523, 33])
    cohorts (shape = [5, 11])
    transactions (shape = [803506, 18])
  Relationships:
    users.cohort_id -> cohorts.cohort_id
    transactions.user_id -> users.user_id

### FEATURE ENGINEERING (DFS)

In [17]:
# Load the saved feature definitions and compute just those features on the
# current entity set.
top_features = ft.load_features("top_features", es)
fm = utils.calculate_feature_matrix_top_features(es, top_features)

# Model input: missing values become 0 and the index is replaced by a plain
# positional one.
# NOTE(review): dropping the index discards whatever row labels `fm` carried.
# The scoring output later pairs predictions with user_details["user_id"], so
# both are assumed to share the same row order -- confirm.
X = fm.fillna(0).reset_index(drop=True)
print("Features built:\n", list(fm.columns))

Features built:
 ['MAX(transactions.trades_sb_long)', 'STD(transactions.trades_sb_open_positions)', 'NUM_UNIQUE(transactions.trades_sb_commission)', 'STD(transactions.trades_sb_long)', 'MEAN(transactions.trades_sb_long)', 'MAX(transactions.conversion_to_sb)', 'SUM(transactions.trades_sb_open_positions)', 'STD(transactions.trades_sb_invested_amount)', 'MAX(transactions.view_position)', 'MEAN(transactions.trades_sb_open_positions)', 'MAX(transactions.trades_sb_short)', 'MEAN(transactions.trades_sb_invested_amount)', 'MEAN(transactions.total_session_duration)', 'SUM(transactions.view_position)', 'MEAN(transactions.trades_sb_short)', 'SUM(transactions.trades_sb_invested_amount)', 'Conversion Completed_hours_till_event', 'SUM(transactions.total_session_duration)', 'SUM(transactions.trades_sb_long)', 'STD(transactions.view_position)']


### LOADING THE MODEL

In [18]:
# Load the persisted model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model = joblib.load('models/model.pkl')
print("Model loaded")

Model loaded


### SCORING

In [19]:
# Score the feature matrix with the loaded model. The problem type is passed
# through to the helper (presumably it selects the form of the output --
# confirm in utils.rf_predict).
y_pred = utils.rf_predict(model, X, prediction_problem_type)
print("Prediction done")

Prediction done


In [20]:
# Assemble the scoring output table: one row per user, constant metadata
# columns, and the predicted class. Nothing is written to disk here; saving
# happens in a later cell.
# NOTE(review): y_pred is paired with user_details["user_id"] without any
# explicit key, so both are assumed to be in the same row order -- confirm.
predictions = pd.DataFrame({"user_id": user_details["user_id"]}).assign(
    topic_type="clv_prediction",
    report_date=pd.to_datetime('today').strftime("%Y-%m-%d"),
    model_type="randomforest",
    class_prediction=y_pred,
    prob=0,  # constant placeholder; no probability is computed in this cell
)[[
    "topic_type", "report_date", "model_type",
    "user_id", "class_prediction", "prob",
]]
predictions.head()

Unnamed: 0,topic_type,report_date,model_type,user_id,class_prediction,prob
0,clv_prediction,2018-05-30,randomforest,00428a27-c6c8-4f77-9a7b-4475219bb6af,0.0,0
1,clv_prediction,2018-05-30,randomforest,00c9c2db-e0e2-4521-ad83-928276fe9e58,0.0,0
2,clv_prediction,2018-05-30,randomforest,0103f512-ec62-4214-8640-64a0532ef2a2,0.0,0
3,clv_prediction,2018-05-30,randomforest,01a6b32c-ca42-4785-96cd-9038bb512177,0.0,0
4,clv_prediction,2018-05-30,randomforest,022be82d-26e5-41cf-9d65-f41691763420,0.0,0


### SAVE AS CSV AND/OR LOAD RESULTS INTO THE DATABASE

In [21]:
# Optionally write the predictions to a CSV file named after the scored user
# timeframe (row index is not written).
if save_as_csv:
    # The file name used to be built by plain concatenation, giving e.g.
    # "scoring/results2018-04-01-2018-04-30": no separator after "results" and
    # no extension. Use explicit separators and a ".csv" suffix instead.
    csv_path = "scoring/results_{}_{}.csv".format(users_from, users_till)
    predictions.to_csv(csv_path, index=False)

In [22]:
# Optionally copy the predictions into the database using the connection
# opened earlier.
# NOTE(review): 'db_table_name' looks like a placeholder -- confirm the real
# target table before enabling load_to_database.
if load_to_database:
    utils_bux.copy_to_database(predictions, 'db_table_name', conn)

In [None]:
# when running as a script
# NOTE(review): `main` is not defined anywhere in the visible code. As the
# comment inside the guard says, the pipeline above still has to be wrapped in
# a `main(users_from, users_till)` function; until then this call raises
# NameError.
# NOTE(review): inside a Jupyter kernel __name__ is also "__main__", so this
# guard does not distinguish notebook from script execution, and sys.argv then
# holds the kernel's launch arguments rather than dates.
if __name__ == "__main__":
    # first and second command-line arguments override the user timeframe
    users_from = sys.argv[1]
    users_till = sys.argv[2]
    # embed all the code above in the main function
    main(users_from, users_till)