In [None]:
# import required libraries
import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op


import gc
import time
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split


In [None]:
# Parameters cell (tag this cell "parameters" so the values can be overridden per run).

# --- GCP project, storage and dataset identifiers ---
PROJECT_ID = 'divg-josh-pr-d1cc3a'
REGION = 'northamerica-northeast1'
BUCKET_NAME = 'divg-josh-pr-d1cc3a-default'
RESOURCE_BUCKET = 'divg-josh-pr-d1cc3a-default'
FILE_BUCKET = 'divg-josh-pr-d1cc3a-default'
DATASET_ID = 'telus_rewards'
TABLE_ID = 'telus_rwrd_redemption_targets'

# --- Model / service identifiers ---
MODEL_ID = '5080'
SERVICE_TYPE = 'telus_rewards'
SERVICE_TYPE_NAME = 'telus-rewards'
TRAINING_DATASET_TABLE_NAME = 'bq_telus_rewards_pipeline_dataset'
TRAINING_DATASET_SP_NAME = 'bq_sp_telus_rewards_pipeline_dataset'

# --- Scoring dates ---
# Pinned dates for this run. Rolling alternatives that were used before:
#   scoringDate    = date.today() - relativedelta(days=2) - relativedelta(months=30)
#   valScoringDate = scoringDate - relativedelta(days=2)
scoringDate = date(2022, 9, 1)
valScoringDate = date(2022, 10, 1)

# Output formats shared by every derived date string below.
_COMPACT_FMT = '%Y%m%d'
_DASH_FMT = '%Y-%m-%d'

# Month boundaries reused by several derived values.
_train_month_start = scoringDate.replace(day=1)
_train_prev_month_end = _train_month_start - timedelta(days=1)
_val_month_start = valScoringDate.replace(day=1)
_val_prev_month_end = _val_month_start - timedelta(days=1)

# Training-period strings.
SCORE_DATE = scoringDate.strftime(_COMPACT_FMT)
SCORE_DATE_DASH = scoringDate.strftime(_DASH_FMT)
# First day of the month six months before the scoring date.
SCORE_DATE_MINUS_6_MOS_DASH = (scoringDate - relativedelta(months=6)).replace(day=1).strftime(_DASH_FMT)
# First and last day of the month preceding the scoring month.
SCORE_DATE_LAST_MONTH_START_DASH = _train_prev_month_end.replace(day=1).strftime(_DASH_FMT)
SCORE_DATE_LAST_MONTH_END_DASH = _train_prev_month_end.strftime(_DASH_FMT)
# First day of the months 3 and 4 months after the scoring month.
PROMO_EXPIRY_START = (_train_month_start + relativedelta(months=3)).strftime(_DASH_FMT)
PROMO_EXPIRY_END = (_train_month_start + relativedelta(months=4)).strftime(_DASH_FMT)

# Validation-period strings (same derivations, anchored on valScoringDate).
SCORE_DATE_VAL = valScoringDate.strftime(_COMPACT_FMT)
SCORE_DATE_VAL_DASH = valScoringDate.strftime(_DASH_FMT)
SCORE_DATE_VAL_MINUS_6_MOS_DASH = (valScoringDate - relativedelta(months=6)).replace(day=1).strftime(_DASH_FMT)
SCORE_DATE_VAL_LAST_MONTH_START_DASH = _val_prev_month_end.replace(day=1).strftime(_DASH_FMT)
SCORE_DATE_VAL_LAST_MONTH_END_DASH = _val_prev_month_end.strftime(_DASH_FMT)
PROMO_EXPIRY_START_VAL = (_val_month_start + relativedelta(months=3)).strftime(_DASH_FMT)
PROMO_EXPIRY_END_VAL = (_val_month_start + relativedelta(months=4)).strftime(_DASH_FMT)

# --- Offsets and windows ---
SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried


In [None]:
# Lower-case working aliases of the notebook parameters, as referenced by the cells below.
project_id, dataset_id = PROJECT_ID, DATASET_ID
file_bucket, service_type = FILE_BUCKET, SERVICE_TYPE
score_date_dash, score_date_val_dash = SCORE_DATE_DASH, SCORE_DATE_VAL_DASH

In [None]:

def get_lift(prob, y_test, q):
    """Build a per-quantile lift table from predicted scores and observed outcomes.

    Scores are cut into ``q`` equal-frequency buckets with ``pd.qcut``. Buckets are
    labelled ``q`` (lowest scores) down to ``1`` (highest scores), and the returned
    table is ordered with bucket ``1`` first.

    Args:
        prob: 1-D array-like or Series of predicted scores.
        y_test: 1-D array-like or Series of observed outcomes, in the same row
            order as ``prob``.
        q: Number of quantile buckets.

    Returns:
        DataFrame with one row per bucket and the columns ``Decile``,
        ``avg_model_pred_redemption_rate`` (mean score in the bucket),
        ``avg_redemption_rate_total`` (mean outcome over all rows),
        ``avg_real_redemption_rate`` (mean outcome in the bucket) and ``lift``
        (bucket outcome rate divided by the overall outcome rate).

    Raises:
        ValueError: Propagated from ``pd.qcut`` when the quantile edges are not
            unique (for example when many scores are tied).
    """
    # When the scores carry no index of their own (plain array/list) the frame gets a
    # fresh RangeIndex; a Series of outcomes with any other index (e.g. after a
    # train/test split or a filter) would then be aligned by label and turn into NaNs
    # or mismatched rows. Pair by position in that case. Two Series are still aligned
    # on their index, as before.
    if isinstance(y_test, pd.Series) and not isinstance(prob, pd.Series):
        y_test = y_test.to_numpy()

    result = pd.DataFrame(columns=['Prob', 'Redemption'])
    result['Prob'] = prob
    result['Redemption'] = y_test
    result['Decile'] = pd.qcut(result['Prob'], q, labels=list(range(q, 0, -1)))

    # observed=False spells out the historical default for categorical group keys.
    by_decile = result.groupby('Decile', observed=False)

    add = by_decile['Redemption'].mean().reset_index()
    add.columns = ['Decile', 'avg_real_redemption_rate']

    lg = by_decile['Prob'].mean().reset_index()
    lg.columns = ['Decile', 'avg_model_pred_redemption_rate']
    # Descending in category order (q, ..., 1) puts bucket 1, the highest scores, first.
    lg = lg.sort_values('Decile', ascending=False)
    lg['avg_redemption_rate_total'] = result['Redemption'].mean()
    lg = lg.merge(add, on='Decile', how='left')
    lg['lift'] = lg['avg_real_redemption_rate'] / lg['avg_redemption_rate_total']

    return lg

# Load the gzipped training and validation extracts from the file bucket.
df_train = pd.read_csv('gs://{}/{}_train.csv.gz'.format(file_bucket, service_type),
                       compression='gzip')
df_test = pd.read_csv('gs://{}/{}_validation.csv.gz'.format(file_bucket, service_type),
                      compression='gzip')

#set up df_train
client = bigquery.Client(project=project_id)
# NOTE(review): this pulls the whole targets table; the YEAR_MONTH filter is applied client-side below.
sql_train = ''' SELECT * FROM `{}.{}.bq_telus_rwrd_redemption_targets` '''.format(project_id, dataset_id)
df_target_train = client.query(sql_train).to_dataframe()

# 'YYYY-MM' prefix of the training scoring date, e.g. '2022-09' for score_date_dash = '2022-09-01'.
train_year_month = '-'.join(score_date_dash.split('-')[:2])
df_target_train = (
    df_target_train
    .loc[df_target_train['YEAR_MONTH'] == train_year_month]
    # assign() returns a new frame, so the cast no longer writes into a filtered
    # slice (which raised SettingWithCopyWarning).
    .assign(ban=lambda targets: targets['ban'].astype('int64'))
)
# One target row per ban: the last row of each ban group, in the order the query returned them.
df_target_train = df_target_train.groupby('ban').tail(1)

# The left join keeps every row of the extract. Afterwards every NaN in the frame
# (including the target of bans without a match) becomes 0 instead of the row being dropped.
df_train = (
    df_train
    .merge(df_target_train[['ban', 'target_ind']], on='ban', how='left')
    .rename(columns={'target_ind': 'target'})
    .fillna(0)
)
df_train['target'] = df_train['target'].astype(int)
print(df_train.shape)

In [None]:
#set up df_test
# NOTE(review): this pulls the whole targets table; the YEAR_MONTH filter is applied client-side below.
sql_test = ''' SELECT * FROM `{}.{}.bq_telus_rwrd_redemption_targets` '''.format(project_id, dataset_id)
df_target_test = client.query(sql_test).to_dataframe()

# 'YYYY-MM' prefix of the validation scoring date, e.g. '2022-10' for score_date_val_dash = '2022-10-01'.
val_year_month = '-'.join(score_date_val_dash.split('-')[:2])
df_target_test = (
    df_target_test
    .loc[df_target_test['YEAR_MONTH'] == val_year_month]
    # assign() returns a new frame, so the cast no longer writes into a filtered
    # slice (which raised SettingWithCopyWarning).
    .assign(ban=lambda targets: targets['ban'].astype('int64'))
)
# One target row per ban: the last row of each ban group, in the order the query returned them.
df_target_test = df_target_test.groupby('ban').tail(1)

# The left join keeps every row of the extract. Afterwards every NaN in the frame
# (including the target of bans without a match) becomes 0 instead of the row being dropped.
df_test = (
    df_test
    .merge(df_target_test[['ban', 'target_ind']], on='ban', how='left')
    .rename(columns={'target_ind': 'target'})
    .fillna(0)
)
df_test['target'] = df_test['target'].astype(int)
print(df_test.shape)


In [None]:
# Write the training frame to the file bucket as an uncompressed CSV.
# NOTE(review): with the default index=True the row index is written as the first column - confirm readers expect it.
train_csv_uri = 'gs://{}/telus_rewards_train.csv'.format(file_bucket)
df_train.to_csv(train_csv_uri)

In [None]:
# Display the 'target' column of the training frame as the cell output.
df_train['target']

In [None]:
# Display the validation frame as the cell output.
df_test

###### 