### Import Libraries, declare variables

In [1]:
import os
import re
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import pandas as pd 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# build model
import xgboost as xgb
from sklearn.metrics import roc_auc_score

SERVICE_TYPE = 'shs_invol_churn'
DATASET_ID = 'shs_invol_churn'
PROJECT_ID = 'divg-groovyhoon-pr-d2eab4' #mapping['PROJECT_ID']
RESOURCE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default' #mapping['resources_bucket']
FILE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default' #mapping['gcs_csv_bucket']
REGION = 'northamerica-northeast1' #mapping['REGION']
MODEL_ID = '9999'
FOLDER_NAME = 'telus_postpaid_churn'.format(MODEL_ID)
QUERIES_PATH = 'vertex_pipelines/' + FOLDER_NAME + '/queries/'
TABLE_ID = 'shs_invol_churn_data_final'

# scoringDate = date(2023, 9, 1)  # date.today() - relativedelta(days=2)- relativedelta(months=30)
# valScoringDate = date(2023, 11, 1)  # scoringDate - relativedelta(days=2)


### import bq to dataframe function

In [2]:
import pandas as pd 
import numpy as np 
from google.cloud import bigquery
from google.oauth2 import credentials

def import_bq_to_dataframe(project_id, dataset_id, table_id, client): 
    
    """
    Imports a specific table from BigQuery to a DataFrame. 
    
    Args: 
        project_id: The name of the project_id where the table is located.
        dataset_id: The name of the dataset_id where the table is located.
        table_id: The name of the table_id you wish to import to DataFrame.
        client: A BigQuery client instance. e.g. client = bigquery.Client(project=project_id).

    Returns: 
        A DataFrame
        
    Example: 
        import_bq_to_dataframe('bi-stg-divg-speech-pr-9d940b', 'call_to_retention_dataset', 'bq_ctr_pipeline_dataset')
        
    """
    
    sql = f"SELECT * FROM `{project_id}.{dataset_id}.{table_id}`"
    
    df_return = client.query(sql).to_dataframe()

    return df_return
    

### define get_lift function, import df_train and df_test from gcs bucket

In [3]:
import gc
import time
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

project_id = PROJECT_ID
region = REGION
resource_bucket = RESOURCE_BUCKET
file_bucket = FILE_BUCKET
service_type=SERVICE_TYPE
project_id=PROJECT_ID
dataset_id=DATASET_ID
table_id = TABLE_ID

def get_lift(prob, y_test, q):
    result = pd.DataFrame(columns=['Prob', 'Churn'])
    result['Prob'] = prob
    result['Churn'] = y_test
    # result['Decile'] = pd.qcut(1-result['Prob'], 10, labels = False)
    result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
    add = pd.DataFrame(result.groupby('Decile')['Churn'].mean()).reset_index()
    add.columns = ['Decile', 'avg_real_churn_rate']
    result = result.merge(add, on='Decile', how='left')
    result.sort_values('Decile', ascending=True, inplace=True)
    lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
    lg.columns = ['Decile', 'avg_model_pred_churn_rate']
    lg.sort_values('Decile', ascending=False, inplace=True)
    lg['avg_churn_rate_total'] = result['Churn'].mean()
    lg = lg.merge(add, on='Decile', how='left')
    lg['lift'] = lg['avg_real_churn_rate'] / lg['avg_churn_rate_total']

    return lg

def get_gcp_bqclient(project_id, use_local_credential=True):
    token = os.popen('gcloud auth print-access-token').read()
    token = re.sub(f'\n$', '', token)
    credentials = google.oauth2.credentials.Credentials(token)

    bq_client = bigquery.Client(project=project_id)
    if use_local_credential:
        bq_client = bigquery.Client(project=project_id, credentials=credentials)
    return bq_client

client = get_gcp_bqclient(project_id)

df = import_bq_to_dataframe(project_id, dataset_id, table_id, client)

print(f'df: {df.shape}')


df: (796102, 203)


### add targets to df_train and df_target 

- df_target_train is from `divg-josh-pr-d1cc3a.tos_crosssell.bq_tos_cross_sell_targets_q3` 
- df_target_test is from `divg-josh-pr-d1cc3a.tos_crosssell.bq_tos_cross_sell_targets_q4` 
- some parts of the code and sql queries need to be dynamically adjusted to be included in the deploy model

In [4]:
# Define the start and end dates of the range
train_start_date = date(2023, 1, 1)
train_end_date = date(2023, 9, 30)

test_start_date = date(2023, 10, 1)
test_end_date = date(2024, 1, 1)

# Filter the DataFrame
df_train = df[(df['Base_Snapshot_Date'] >= train_start_date) & (df['Base_Snapshot_Date'] <= train_end_date)]
print(df_train.shape)

df_test = df[(df['Base_Snapshot_Date'] >= test_start_date) & (df['Base_Snapshot_Date'] <= test_end_date)]
print(df_test.shape)

#set up features (list)
cols_1 = df_train.columns.values
cols_2 = df_test.columns.values

cols = set(cols_1).intersection(set(cols_2))

features_to_exclude = ['CUST_ID', 'Bus_Billing_Account_Num', 'Bus_Prod_Instnc_Id', 
                       'Base_Snapshot_Date', 'Security_Origin', 'ACQUIRED_FROM', 
                       'Acquisition_Source', 'vol', 'invol', 'churn']

features = [f for f in cols if f not in features_to_exclude]

ban_train = df_train[['CUST_ID', 'Bus_Billing_Account_Num', 'Bus_Prod_Instnc_Id']]
X_train = df_train[features]
y_train = np.squeeze(df_train['invol'].values)

ban_test = df_test[['CUST_ID', 'Bus_Billing_Account_Num', 'Bus_Prod_Instnc_Id']]
X_test = df_test[features]
y_test = np.squeeze(df_test['invol'].values)


(496618, 203)
(299484, 203)


### preprocess

In [5]:
# Now we need to transform the features of the feature store.
def encode_categorical_features(df):
    # Get a list of all categorical columns
    cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Encode each categorical column
    for col in cat_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        
    return df
    
X_train = encode_categorical_features(X_train)
X_test = encode_categorical_features(X_test)

#set up features (list)
cols_1 = X_train.columns.values
cols_2 = X_test.columns.values

cols = set(cols_1).intersection(set(cols_2))

features = [f for f in cols if f not in features_to_exclude]

X_train = X_train[features] 
X_test = X_test[features] 

print(X_train.columns) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

Index(['smhm_net_amt_1', 'tot_crdt_amt_l12m', 'smhm_chrg_amt_2',
       'hsic_disc_amt_2', 'smhm_crdt_amt_1', 'txn_sub_typ_txt',
       'other_chrg_amt_2', 'Commitment_Type', 'Tenure_Month_Groups',
       'ttv_disc_amt_l12m',
       ...
       'social_grp_nm', 'tot_chrg_amt_1', 'other_net_amt_2', 'Video_',
       'smhm_chrg_amt_l12m', 'hsic_disc_amt_1', 'tot_crdt_amt_1', 'avg_income',
       'HasLWC', 'hsic_chrg_amt_3'],
      dtype='object', length=193)


### fit training data in xgboost classifier

In [6]:

# build model and fit in training data
import xgboost as xgb
from sklearn.metrics import roc_auc_score

xgb_model = xgb.XGBClassifier(
    learning_rate=0.05,
    n_estimators=250,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)

xgb_model.fit(X_train, y_train)
print('xgb training done')


xgb training done


### make predictions on X_test set, assign deciles to the predicted values, and save in df_test_exp

In [16]:
#predictions on X_test
y_pred = xgb_model.predict_proba(X_test)[:, 1]

#join ban_test, X_test, y_test and y_pred and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=100
df_ban_test = ban_test
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = y_pred
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(y_pred, y_test, q)

lg

  add = pd.DataFrame(result.groupby('Decile')['Churn'].mean()).reset_index()
  lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()


Unnamed: 0,Decile,avg_model_pred_churn_rate,avg_churn_rate_total,avg_real_churn_rate,lift
0,1,0.757091,0.013911,0.878464,63.150731
1,2,0.117372,0.013911,0.454424,32.667482
2,3,0.003406,0.013911,0.054090,3.888414
3,4,0.000335,0.013911,0.002337,0.168018
4,5,0.000094,0.013911,0.001336,0.096010
...,...,...,...,...,...
95,96,0.000004,0.013911,0.000000,0.000000
96,97,0.000004,0.013911,0.000000,0.000000
97,98,0.000003,0.013911,0.000000,0.000000
98,99,0.000003,0.013911,0.000000,0.000000


In [8]:
# Get feature importances from xgboost model
importances = xgb_model.feature_importances_

# Get the index of importances from greatest importance to least
sorted_index = np.argsort(importances)[::-1]
x = range(len(importances))

feature_names = X_train.columns

# Create tick labels 
labels = np.array(feature_names)[sorted_index]

for idx in sorted_index: 
    print(f'{feature_names[idx]}, {importances[idx]}', end='\n')

Segment, 0.4673186242580414
Best_practices, 0.08102121949195862
yr_mth_1, 0.04818199947476387
hsic_chrg_amt_3, 0.036004193127155304
yr_mth_3, 0.03210742026567459
Login_consistency, 0.01729782484471798
Arming_consistency, 0.016159120947122574
Tenure_Month_Groups__All_, 0.015361352823674679
CREDIT_VALUE_CD__group_, 0.014392083510756493
Tenure_Month_Groups, 0.013182554394006729
other_disc_amt_1, 0.012951711192727089
other_disc_amt_3, 0.00999209564179182
Security_Plan_Type, 0.008264691568911076
other_disc_amt_l12m, 0.007997366599738598
hsic_disc_amt_l12m, 0.007878735661506653
Commitment_Type, 0.0076604015193879604
smhm_net_amt_3, 0.007567053660750389
Security_Plan_Group, 0.006887315306812525
TOS_FLAG, 0.006080618593841791
Price_Plan_Txt__SHS_, 0.005969710648059845
smhm_chrg_amt_2, 0.005469792988151312
Contract_Type, 0.004792660940438509
tot_chrg_amt_1, 0.004237063694745302
tot_net_amt_l12m, 0.0040846155025064945
hsic_chrg_amt_1, 0.004029497969895601
HasTOS, 0.004003380425274372
other_disc_

### get feature importances from xgboost model

In [9]:
# Get feature importances from xgboost model
importances = xgb_model.feature_importances_

# Get the index of importances from greatest importance to least
sorted_index = np.argsort(importances)[::-1]
x = range(len(importances))

feature_names = X_train.columns

# Create tick labels 
labels = np.array(feature_names)[sorted_index]

In [10]:
labels

array(['Segment', 'Best_practices', 'yr_mth_1', 'hsic_chrg_amt_3',
       'yr_mth_3', 'Login_consistency', 'Arming_consistency',
       'Tenure_Month_Groups__All_', 'CREDIT_VALUE_CD__group_',
       'Tenure_Month_Groups', 'other_disc_amt_1', 'other_disc_amt_3',
       'Security_Plan_Type', 'other_disc_amt_l12m', 'hsic_disc_amt_l12m',
       'Commitment_Type', 'smhm_net_amt_3', 'Security_Plan_Group',
       'TOS_FLAG', 'Price_Plan_Txt__SHS_', 'smhm_chrg_amt_2',
       'Contract_Type', 'tot_chrg_amt_1', 'tot_net_amt_l12m',
       'hsic_chrg_amt_1', 'HasTOS', 'other_disc_amt_2', 'smhm_chrg_amt_1',
       'hsic_chrg_amt_l12m', 'Tenure_Year_Group',
       'PRICE_PLAN_TXT__Custom_SQL_Query_', 'smhm_chrg_amt_3',
       'tot_chrg_amt_2', 'tot_inv_amt_l12m', 'price_plan',
       'tot_chrg_amt_3', 'txn_sub_typ_txt', 'other_chrg_amt_3',
       'tot_net_amt_1', 'Cultural_Segment', 'tot_inv_amt_1',
       'tot_inv_amt_3', 'Contracted__Account_', 'other_net_amt_2',
       'hsic_net_amt_l12m', 'other

In [11]:
importances

array([1.77138217e-03, 1.14177854e-03, 5.46979299e-03, 1.07963406e-03,
       6.56650227e-04, 2.37801648e-03, 1.48344098e-03, 7.66040152e-03,
       1.31825544e-02, 0.00000000e+00, 0.00000000e+00, 3.53710493e-04,
       7.87873566e-03, 4.67318624e-01, 4.45608137e-04, 0.00000000e+00,
       8.10212195e-02, 9.46621527e-04, 6.68089488e-04, 1.21795293e-03,
       5.90642798e-04, 0.00000000e+00, 1.27781183e-03, 6.17902086e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.10085157e-03, 0.00000000e+00, 0.00000000e+00, 9.41378006e-04,
       6.88731531e-03, 7.56705366e-03, 9.09504481e-04, 3.81125393e-03,
       2.83846934e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       3.86614259e-03, 1.02035026e-03, 3.21074203e-02, 1.90844981e-03,
       0.00000000e+00, 3.92699847e-03, 0.00000000e+00, 1.17481360e-03,
       1.32363394e-03, 9.68020526e-04, 0.00000000e+00, 3.56568053e-04,
       0.00000000e+00, 8.97926686e-04, 0.00000000e+00, 1.32461230e-03,
      

In [12]:
np.argsort(importances)[::-1]

array([ 13,  16,  58, 192,  42, 147, 140, 163, 102,   8, 120, 180,  87,
       100,  12,   7,  33,  32,  69,  93,   2, 122, 184, 154,  96, 142,
        45,  40,  35, 115, 114, 166,  36,  66, 113, 134,   5,  83, 149,
        80,  94, 112, 137, 185, 123,  76,  43, 153,  86,   0, 128, 181,
        70, 127,  79, 129,  65, 138,  77,   6, 178, 117, 156, 173, 103,
        75, 106, 161,  55,  48, 186,  22, 187, 171, 189, 190, 132,  19,
       130,  47,  67,   1, 175,  60, 143, 105,  28,  71, 125, 162,   3,
       109, 174,  57,  91,  62,  64,  61,  41,  84, 152, 176,  49, 110,
        85,  17,  31, 158,  95,  92, 146,  89,  34, 168,  53,  88, 183,
        74, 155, 116, 119,  18,   4,  23, 170,  20,  14, 145, 104,  97,
        51,  11, 133,  59, 150, 179,  27, 139, 182, 144, 165, 136,  30,
        29, 135, 131,  26, 188, 164,  24,   9,  10, 126,  21,  15,  25,
        52,  37,  38,  73, 151, 157, 111,  78,  81,  82, 169, 177,  90,
       191,  98,  99, 108, 101,  72, 141, 148, 172,  39, 124, 16

In [13]:
error

NameError: name 'error' is not defined

### Load results to BQ - WIP

In [None]:
SCORE_TABLE_ID = 'bq_telus_12_months_churn_scores'

project_id = PROJECT_ID 
dataset_id = DATASET_ID
score_table_id = SCORE_TABLE_ID

# get full score to cave into bucket
pred_prob = xgb_model.predict_proba(X_val, ntree_limit=xgb_model.best_iteration)[:, 1]
result = pd.DataFrame(columns=['ban', 'subscriber_no', 'score_date', 'y_true', 'y_pred'])

result['ban'] = list(ban_val['ban'])
result['ban'] = result['ban'].astype('str')

result['subscriber_no'] = list(ban_val['ban'])
result['subscriber_no'] = result['subscriber_no'].astype('str')

result['score_date'] = "2023-11-04"

result['y_true'] = list(y_val)
result['y_true'] = result['y_true'].fillna(0.0).astype('float64')

result['y_pred'] = list(pred_prob)
result['y_pred'] = result['y_pred'].fillna(0.0).astype('float64')

############# updated up to here ############

result.to_csv('gs://{}/{}/ucar/{}_prediction_v2.csv'.format(file_bucket, service_type, service_type), index=False)

# define dtype_bq_mapping
dtype_bq_mapping = {np.dtype('int64'): 'INTEGER', 
np.dtype('float64'):  'FLOAT', 
np.dtype('float32'):  'FLOAT', 
np.dtype('object'):  'STRING', 
np.dtype('bool'):  'BOOLEAN', 
np.dtype('datetime64[ns]'):  'DATE', 
pd.Int64Dtype(): 'INTEGER'} 

# export df_final to bigquery 
schema_list = [] 
for column in result.columns: 
    schema_list.append(bigquery.SchemaField(column, dtype_bq_mapping[result.dtypes[column]], mode='NULLABLE')) 
print(schema_list) 

dest_table = f'{project_id}.{dataset_id}.{score_table_id}'

# Sending to bigquery 
client = get_gcp_bqclient(project_id)
job_config = bigquery.LoadJobConfig(schema=schema_list, write_disposition='WRITE_TRUNCATE') 
job = client.load_table_from_dataframe(result, dest_table, job_config=job_config) 
job.result() 

table = client.get_table(dest_table) # Make an API request 
print("Loaded {} rows and {} columns to {}".format(table.num_rows, len(table.schema), table_id)) 

time.sleep(60)

# table_ref = f'{project_id}.{dataset_id}.{score_table}'
# client = bigquery.Client(project=project_id)
# table = client.get_table(table_ref)
# schema = table.schema

# ll = []
# for item in schema:
#     col = item.name
#     d_type = item.field_type
#     if 'float' in str(d_type).lower():
#         d_type = 'FLOAT64'
#     ll.append((col, d_type))

#     if 'integer' in str(d_type).lower():
#         result[col] = result[col].fillna(0).astype(int)
#     if 'float' in str(d_type).lower():
#         result[col] = result[col].fillna(0.0).astype(float)
#     if 'string' in str(d_type).lower():
#         result[col] = result[col].fillna('').astype(str)

# table_ref = '{}.{}.{}'.format(project_id, dataset_id, temp_table)
# client = bigquery.Client(project=project_id)
# if if_tbl_exists(client, table_ref):
#     client.delete_table(table_ref)

# client.create_table(table_ref)
# config = bigquery.LoadJobConfig(schema=schema)
# config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
# bq_table_instance = client.load_table_from_dataframe(result, table_ref, job_config=config)
# time.sleep(5)

# drop_sql = f"""delete from `{project_id}.{dataset_id}.{score_table}` where score_date = '{score_date_dash}'"""  # .format(project_id, dataset_id, score_date_dash)
# client.query(drop_sql)
# #
# load_sql = f"""insert into `{project_id}.{dataset_id}.{score_table}`
#               select * from `{project_id}.{dataset_id}.{temp_table}`"""
# client.query(load_sql)


In [None]:
error

### save the model in gcs

In [None]:
# save the model in GCS
from datetime import datetime
models_dict = {}
create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
models_dict['create_time'] = create_time
models_dict['model'] = xgb_model
models_dict['features'] = features

with open('model_dict.pkl', 'wb') as handle:
    pickle.dump(models_dict, handle)
handle.close()

storage_client = storage.Client()
bucket = storage_client.get_bucket(file_bucket)

MODEL_PATH = '{}_xgb_models/'.format(service_type)
blob = bucket.blob(MODEL_PATH)
if not blob.exists(storage_client):
    blob.upload_from_string('')

model_name_onbkt = '{}{}_models_xgb_{}'.format(MODEL_PATH, service_type, models_dict['create_time'])
blob = bucket.blob(model_name_onbkt)
blob.upload_from_filename('model_dict.pkl')

print(f"....model loaded to GCS done at {str(create_time)}")

time.sleep(300)

### load the latest saved xgb_model to the environment

In [None]:
# MODEL_PATH = '{}_xgb_models/'.format(service_type)
# df_score = pd.read_csv('gs://{}/{}_score.csv.gz'.format(file_bucket, service_type), compression='gzip')
# df_score.dropna(subset=['ban'], inplace=True)
# df_score.reset_index(drop=True, inplace=True)
# print('......scoring data loaded:{}'.format(df_score.shape))
# time.sleep(10)
from google.cloud import bigquery
from google.cloud import storage

MODEL_PATH = '{}_xgb_models/'.format(service_type)

def load_model(file_bucket: str, service_type: str): 
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)
    blobs = storage_client.list_blobs(file_bucket, prefix='{}{}_models_xgb_'.format(MODEL_PATH, service_type))

    model_lists = []
    for blob in blobs:
        model_lists.append(blob.name)

    blob = bucket.blob(model_lists[-1])
    blob_in = blob.download_as_string()
    model_dict = pickle.loads(blob_in)
    xgb_model = model_dict['model']
    features = model_dict['features']
    print('...... model loaded')
    time.sleep(10)
    
    return xgb_model, features

xgb_model, features = load_model(file_bucket = FILE_BUCKET, service_type = SERVICE_TYPE) 

### backup codes 

In [None]:
def get_gcp_bqclient(project_id, use_local_credential=True):
    token = os.popen('gcloud auth print-access-token').read()
    token = re.sub(f'\n$', '', token)
    credentials = google.oauth2.credentials.Credentials(token)

    bq_client = bigquery.Client(project=project_id)
    if use_local_credential:
        bq_client = bigquery.Client(project=project_id, credentials=credentials)
    return bq_client

client = get_gcp_bqclient(project_id)

#instantiate df_target_train and df_target_test
sql_train = ''' SELECT * FROM `{}.{}.bq_tos_cross_sell_targets_q3` '''.format(project_id, dataset_id)
df_target_train = client.query(sql_train).to_dataframe()
df_target_train = df_target_train.loc[
    df_target_train['YEAR_MONTH'] == "2022-Q3"] #'-'.join(score_date_dash.split('-')[:2])]  # score_date_dash = '2022-08-31'

#set up df_train and df_test (add 'target')
df_target_train['ban'] = df_target_train['ban'].astype('int64')
df_target_train = df_target_train.groupby('ban').tail(1)

df_train = df_train.merge(df_target_train[['ban', 'product_crosssell_ind']], on='ban', how='left')
df_train.rename(columns={'product_crosssell_ind': 'target'}, inplace=True)
df_train.dropna(subset=['target'], inplace=True)
df_train['target'] = df_train['target'].astype(int)

df_train

In [None]:
sum(df_train['target'])

In [None]:
def get_gcp_bqclient(project_id, use_local_credential=True):
    token = os.popen('gcloud auth print-access-token').read()
    token = re.sub(f'\n$', '', token)
    credentials = google.oauth2.credentials.Credentials(token)

    bq_client = bigquery.Client(project=project_id)
    if use_local_credential:
        bq_client = bigquery.Client(project=project_id, credentials=credentials)
    return bq_client

client = get_gcp_bqclient(project_id)
sql_test = ''' SELECT * FROM `{}.{}.bq_tos_cross_sell_targets_q4` '''.format(project_id, dataset_id)
df_target_test = client.query(sql_test).to_dataframe()
df_target_test = df_target_test.loc[
    df_target_test['YEAR_MONTH'] == "2022-Q4"] #'-'.join(score_date_val_dash.split('-')[:2])]  # score_date_dash = '2022-09-30'

#set up df_train and df_test (add 'target')
df_target_test['ban'] = df_target_test['ban'].astype('int64')
df_target_test = df_target_test.groupby('ban').tail(1)

df_test = df_test.merge(df_target_test[['ban', 'product_crosssell_ind']], on='ban', how='left')
df_test.rename(columns={'product_crosssell_ind': 'target'}, inplace=True)
df_test.dropna(subset=['target'], inplace=True)
df_test['target'] = df_test['target'].astype(int)

df_test

In [None]:
sum(df_test['target'])