# User Authentication


In [1]:
# Colab-only helper: runs the interactive sign-in flow for the current user
# so that later Google Cloud client calls in this runtime are authorized.
from google.colab import auth

auth.authenticate_user()

# Only reached when the sign-in call above returned without raising.
print("Authenticated")

Authenticated



# Getting the right Libraries and Project ID


Make sure we are using the latest version of the BigQuery Python client library.



In [0]:
!pip install --upgrade google-cloud-bigquery

Import the BigQuery Python client library and initialize the client with the correct project ID.

In [0]:
from google.cloud import bigquery

# Google Cloud project used as the client's default project. The queries
# below reference datasets without a project prefix, so they presumably
# resolve against this project -- confirm if the datasets live elsewhere.
project_id = 'dev-d2c-engagement-scoring'

# Shared BigQuery client reused by every query cell in this notebook.
client = bigquery.Client(project=project_id)

# Create Model

This is where we define the shape and parameters of the model. Each component is commented with the options worth considering.


In [7]:
# Submit the CREATE MODEL statement. client.query() only *starts* the job and
# returns immediately, so the job handle is kept in order to wait on it below.
create_model_job = client.query('''
# MODEL  has the shape of <dataset>.<modelname> 
# where <dataset> should already exist.

CREATE OR REPLACE MODEL `Lancome_MX_d2c_engagement_scoring.sample_model`

# Transform clasue enables doing data preprocessing as well as 
# feature selection. Whatever preprocessing defind in this stage will also
# be applied to new data when using the PREDICT module.

  TRANSFORM( 
    total_transactions, 
    # EXAMPLE OF STANDARD SCALER
    ML.STANDARD_SCALER(total_add_to_carts) OVER() as scaled_add_to_carts,	
    ML.STANDARD_SCALER(total_pageviews_over_3) OVER() as scaled_pageviews_over_3,	
    ML.STANDARD_SCALER(total_product_views) OVER() as scaled_product_views,	

    # EXAMPLE OF MIN MAX SCALER
    ML.MIN_MAX_SCALER(total_session_time_over_120s) OVER() as scaled_session_time_over_120s,	

    # EXAMPLE OF NON TRANSFORMED FEATURES
    mobile_session_share,
    desktop_session_share,	
    tablet_session_share,	
    traffic_share_from_mexico,	
    avg_bounce_rate2,	
    remarketing_session_share,	
    avg_product_revenue,
    avg_order_value,
    avg_local_order_value,
    avg_session_quality
  )

# In the following OPTIONS section we define model arguments

  OPTIONS(
    # Model can be LINEAR_REG, LOGISTIC_REG, 'KMEANS'
    model_type='LINEAR_REG', 
    # The maximum number of training iterations or steps.
    max_iteration=50, 
    # Learning Rate to apply
    ls_init_learn_rate=.15,
    # Here we choose a regularization method, we can try with L2 as to nor 
    # drive any coefficient to 0 initially.
    l2_reg=1,
    # Options for data split are 'AUTO_SPLIT', 'RANDOM', 'SEQ' or 'CUSTOM'
    # We will eventually move to 'SEQ' adding timestamp to evaluate performance
    # over different periods.
    data_split_method = 'RANDOM',
    # Difine KPI to model
    input_label_cols=['total_transactions']) AS

SELECT
  *
FROM
  `Lancome_MX_d2c_engagement_scoring.mx_lancome_d2csignals_last14days`

# Ideally here we would add a WHERE clause to filter out some instances 
# and leave them for model evaluation.

''')

# Block until training has finished. Without this the cell returns while the
# job is still running, so later cells could query a model that does not exist
# yet (or a stale one), and SQL/training errors would never be raised here.
create_model_job.result()

# Show the finished job object as the cell output.
create_model_job

<google.cloud.bigquery.job.QueryJob at 0x7f6714ee51d0>

# Results of Model Training

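The `ML.TRAINING_INFO` query below reports per-iteration statistics (loss, eval_loss, learning_rate, duration_ms). The evaluation metrics for linear regression, returned by `ML.EVALUATE` in the next section, will include: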

*   mean_absolute_error
*   mean_squared_error
*   mean_squared_log_error
*   median_absolute_error
*   r2_score
*   explained_variance



In [8]:
# Ask BigQuery ML for the training log of the model created above: one row per
# training iteration (training_run, iteration, loss, eval_loss, learning_rate,
# duration_ms).
training_info_job = client.query('''SELECT *
  
  FROM ML.TRAINING_INFO(MODEL `Lancome_MX_d2c_engagement_scoring.sample_model`)''')

# to_dataframe() waits for the query to finish and loads the rows into pandas.
training_result = training_info_job.to_dataframe()

print(training_result)

   training_run  iteration      loss  eval_loss  learning_rate  duration_ms
0             0          0  8.932947  27.182758           0.15         2159


# Results on New Data

Results of Model Evaluation on completely unseen Data

In [0]:
# Evaluate the trained model.
#
# The previous version of this query could not run: its parentheses were
# unbalanced and it used the string placeholder 'some filter clause' as the
# WHERE condition, which BigQuery rejects because WHERE must be a BOOL.
#
# ML.EVALUATE is therefore called without an input query. In that form the
# metrics are computed on the evaluation split that BigQuery ML reserved during
# training (the model is created with data_split_method = 'RANDOM'), i.e. on
# rows the model was not fitted on.
#
# Once CREATE MODEL filters out an explicit hold-out set, pass it as a second
# argument instead:
#   ML.EVALUATE(MODEL `<dataset>.<model>`, (SELECT * FROM `<table>` WHERE <bool condition>))
evaluation_result = client.query('''
SELECT
  *
FROM ML.EVALUATE(MODEL `Lancome_MX_d2c_engagement_scoring.sample_model`)
''').to_dataframe()

print(evaluation_result)