### Import Libraries, declare variables

In [1]:
import os
import re
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import pandas as pd 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# build model
import xgboost as xgb
from sklearn.metrics import roc_auc_score

SERVICE_TYPE = 'shs_invol_churn'
DATASET_ID = 'shs_invol_churn'
PROJECT_ID = 'divg-groovyhoon-pr-d2eab4' #mapping['PROJECT_ID']
RESOURCE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default' #mapping['resources_bucket']
FILE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default' #mapping['gcs_csv_bucket']
REGION = 'northamerica-northeast1' #mapping['REGION']
MODEL_ID = '9999'
FOLDER_NAME = 'telus_postpaid_churn'.format(MODEL_ID)
QUERIES_PATH = 'vertex_pipelines/' + FOLDER_NAME + '/queries/'
TABLE_ID = 'shs_invol_churn_data_final'

# scoringDate = date(2023, 9, 1)  # date.today() - relativedelta(days=2)- relativedelta(months=30)
# valScoringDate = date(2023, 11, 1)  # scoringDate - relativedelta(days=2)


### import bq to dataframe function

In [2]:
import pandas as pd 
import numpy as np 
from google.cloud import bigquery
from google.oauth2 import credentials

def import_bq_to_dataframe(project_id, dataset_id, table_id, client): 
    
    """
    Imports a specific table from BigQuery to a DataFrame. 
    
    Args: 
        project_id: The name of the project_id where the table is located.
        dataset_id: The name of the dataset_id where the table is located.
        table_id: The name of the table_id you wish to import to DataFrame.
        client: A BigQuery client instance. e.g. client = bigquery.Client(project=project_id).

    Returns: 
        A DataFrame
        
    Example: 
        import_bq_to_dataframe('bi-stg-divg-speech-pr-9d940b', 'call_to_retention_dataset', 'bq_ctr_pipeline_dataset')
        
    """
    
    sql = f"SELECT * FROM `{project_id}.{dataset_id}.{table_id}`"
    
    df_return = client.query(sql).to_dataframe()

    return df_return
    

### define get_lift function, import df_train and df_test from gcs bucket

In [3]:
import gc
import time
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

project_id = PROJECT_ID
region = REGION
resource_bucket = RESOURCE_BUCKET
file_bucket = FILE_BUCKET
service_type=SERVICE_TYPE
project_id=PROJECT_ID
dataset_id=DATASET_ID
table_id = TABLE_ID

def get_lift(prob, y_test, q):
    result = pd.DataFrame(columns=['Prob', 'Churn'])
    result['Prob'] = prob
    result['Churn'] = y_test
    # result['Decile'] = pd.qcut(1-result['Prob'], 10, labels = False)
    result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
    add = pd.DataFrame(result.groupby('Decile')['Churn'].mean()).reset_index()
    add.columns = ['Decile', 'avg_real_churn_rate']
    result = result.merge(add, on='Decile', how='left')
    result.sort_values('Decile', ascending=True, inplace=True)
    lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
    lg.columns = ['Decile', 'avg_model_pred_churn_rate']
    lg.sort_values('Decile', ascending=False, inplace=True)
    lg['avg_churn_rate_total'] = result['Churn'].mean()
    lg = lg.merge(add, on='Decile', how='left')
    lg['lift'] = lg['avg_real_churn_rate'] / lg['avg_churn_rate_total']

    return lg

def get_gcp_bqclient(project_id, use_local_credential=True):
    token = os.popen('gcloud auth print-access-token').read()
    token = re.sub(f'\n$', '', token)
    credentials = google.oauth2.credentials.Credentials(token)

    bq_client = bigquery.Client(project=project_id)
    if use_local_credential:
        bq_client = bigquery.Client(project=project_id, credentials=credentials)
    return bq_client

client = get_gcp_bqclient(project_id)

df = import_bq_to_dataframe(project_id, dataset_id, table_id, client)

print(f'df: {df.shape}')


df: (796251, 189)


### add targets to df_train and df_target 

- df_target_train is from `divg-josh-pr-d1cc3a.tos_crosssell.bq_tos_cross_sell_targets_q3` 
- df_target_test is from `divg-josh-pr-d1cc3a.tos_crosssell.bq_tos_cross_sell_targets_q4` 
- some parts of the code and sql queries need to be dynamically adjusted to be included in the deploy model

In [4]:
# Define the start and end dates of the range
train_start_date = date(2023, 1, 1)
train_end_date = date(2023, 9, 30)

test_start_date = date(2023, 10, 1)
test_end_date = date(2024, 1, 1)

# Filter the DataFrame
df_train = df[(df['Base_Snapshot_Date'] >= train_start_date) & (df['Base_Snapshot_Date'] <= train_end_date)]
print(df_train.shape)

df_test = df[(df['Base_Snapshot_Date'] >= test_start_date) & (df['Base_Snapshot_Date'] <= test_end_date)]
print(df_test.shape)

#set up features (list)
cols_1 = df_train.columns.values
cols_2 = df_test.columns.values

cols = set(cols_1).intersection(set(cols_2))

features_to_exclude = ['CUST_ID', 'Bus_Billing_Account_Num', 'Bus_Prod_Instnc_Id', 
                       'Base_Snapshot_Date', 'Security_Origin', 'ACQUIRED_FROM', 
                       'Acquisition_Source', 'vol', 'invol', 'churn']

features = [f for f in cols if f not in features_to_exclude]

ban_train = df_train[['CUST_ID', 'Bus_Billing_Account_Num', 'Bus_Prod_Instnc_Id']]
X_train = df_train[features]
y_train = np.squeeze(df_train['invol'].values)

ban_test = df_test[['CUST_ID', 'Bus_Billing_Account_Num', 'Bus_Prod_Instnc_Id']]
X_test = df_test[features]
y_test = np.squeeze(df_test['invol'].values)


(496735, 189)
(299516, 189)


### preprocess

In [8]:
# Now we need to transform the features of the feature store.
def encode_categorical_features(df):
    # Get a list of all categorical columns
    cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Encode each categorical column
    for col in cat_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        
    return df
    
X_train = encode_categorical_features(X_train)
X_test = encode_categorical_features(X_test)

#set up features (list)
cols_1 = X_train.columns.values
cols_2 = X_test.columns.values

cols = set(cols_1).intersection(set(cols_2))

features = [f for f in cols if f not in features_to_exclude]

X_train = X_train[features] 
X_test = X_test[features] 

print(X_train.columns) 

Index(['tot_disc_amt_2', 'census_subdivision_desc', 'HasT_Pre',
       'Channel_Category', 'tot_inv_amt_1', 'HasWHP_K', 'ttv_chrg_amt_3',
       'hsic_crdt_amt_2', 'bi_chnl_tag_cd', 'hsic_chrg_amt_2',
       ...
       'sing_disc_amt_l12m', 'sing_disc_amt_3', 'tot_net_amt_l12m', 'HasWHP_T',
       'HasNOS', 'HasLWC', 'CHNL_ORG_TXT__Custom_SQL_Query_',
       'Contracted__Account_', 'HasWHP', 'smhm_crdt_amt_1'],
      dtype='object', length=179)


### fit training data in xgboost classifier

In [9]:

# build model and fit in training data
import xgboost as xgb
from sklearn.metrics import roc_auc_score

xgb_model = xgb.XGBClassifier(
    learning_rate=0.05,
    n_estimators=250,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)

xgb_model.fit(X_train, y_train)
print('xgb training done')


xgb training done


### make predictions on X_test set, assign deciles to the predicted values, and save in df_test_exp

In [10]:
#predictions on X_test
y_pred = xgb_model.predict_proba(X_test)[:, 1]

#join ban_test, X_test, y_test and y_pred and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = y_pred
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(y_pred, y_test, q)

lg

  add = pd.DataFrame(result.groupby('Decile')['Churn'].mean()).reset_index()
  lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()


Unnamed: 0,Decile,avg_model_pred_churn_rate,avg_churn_rate_total,avg_real_churn_rate,lift
0,1,0.111989,0.013909,0.125167,8.99892
1,2,0.006156,0.013909,0.009749,0.700926
2,3,0.002035,0.013909,0.002704,0.194429
3,4,0.001024,0.013909,0.000534,0.038407
4,5,0.000618,0.013909,0.000234,0.016802
5,6,0.000398,0.013909,0.000334,0.024004
6,7,0.00026,0.013909,6.7e-05,0.004801
7,8,0.000163,0.013909,0.0001,0.007201
8,9,8.9e-05,0.013909,0.0001,0.007201
9,10,3.6e-05,0.013909,0.0001,0.007201


In [17]:
# Get feature importances from xgboost model
importances = xgb_model.feature_importances_

# Get the index of importances from greatest importance to least
sorted_index = np.argsort(importances)[::-1]
x = range(len(importances))

feature_names = X_train.columns

# Create tick labels 
labels = np.array(feature_names)[sorted_index]

for idx in sorted_index: 
    print(f'{feature_names[idx]}, {importances[idx]}', end='\n')

Tenure_Month_Groups, 0.07723607867956161
Tenure_Month_Groups__All_, 0.07379674166440964
CREDIT_VALUE_CD__group_, 0.05787818878889084
Commitment_Type, 0.04177093133330345
smhm_chrg_amt_1, 0.029099594801664352
Contract_Type, 0.02799258753657341
price_plan, 0.01979736238718033
smhm_chrg_amt_2, 0.019004415720701218
tot_chrg_amt_1, 0.017116041854023933
Security_Plan_Group, 0.017021914944052696
Price_Plan_Txt__SHS_, 0.012706797569990158
PRICE_PLAN_RATE_AMT__Custom_SQL_Query_, 0.012491551227867603
other_chrg_amt_3, 0.012043689377605915
smhm_chrg_amt_3, 0.011646443046629429
smhm_net_amt_1, 0.011556265875697136
tot_inv_amt_1, 0.011354784481227398
tot_chrg_amt_2, 0.011181851848959923
prov_cd, 0.011034419760107994
Product_Profile, 0.010659968480467796
tot_chrg_amt_3, 0.01018329057842493
Province_Grp, 0.009880646131932735
other_chrg_amt_1, 0.009363624267280102
Security_Plan_Type, 0.009313363581895828
hsic_disc_amt_3, 0.009199962019920349
Tenure_Year_Group, 0.008927229791879654
other_disc_amt_3, 0.

### get feature importances from xgboost model

In [11]:
# Get feature importances from xgboost model
importances = xgb_model.feature_importances_

# Get the index of importances from greatest importance to least
sorted_index = np.argsort(importances)[::-1]
x = range(len(importances))

feature_names = X_train.columns

# Create tick labels 
labels = np.array(feature_names)[sorted_index]

In [12]:
labels

array(['Tenure_Month_Groups', 'Tenure_Month_Groups__All_',
       'CREDIT_VALUE_CD__group_', 'Commitment_Type', 'smhm_chrg_amt_1',
       'Contract_Type', 'price_plan', 'smhm_chrg_amt_2', 'tot_chrg_amt_1',
       'Security_Plan_Group', 'Price_Plan_Txt__SHS_',
       'PRICE_PLAN_RATE_AMT__Custom_SQL_Query_', 'other_chrg_amt_3',
       'smhm_chrg_amt_3', 'smhm_net_amt_1', 'tot_inv_amt_1',
       'tot_chrg_amt_2', 'prov_cd', 'Product_Profile', 'tot_chrg_amt_3',
       'Province_Grp', 'other_chrg_amt_1', 'Security_Plan_Type',
       'hsic_disc_amt_3', 'Tenure_Year_Group', 'other_disc_amt_3',
       'tot_net_amt_l12m', 'other_chrg_amt_2', 'tot_net_amt_1',
       'Contracted__Account_', 'tot_inv_amt_3', 'smhm_net_amt_3',
       'tot_inv_amt_2', 'HasHSIA', 'tot_net_amt_3', 'tot_inv_amt_l12m',
       'smhm_net_amt_2', 'Installation_Type', 'DIY_Activated',
       'other_chrg_amt_l12m', 'tot_chrg_amt_l12m', 'HSIA_Per_Cust',
       'tot_net_amt_2', 'avg_income', 'tot_tax_inv_amt_1',
       'smhm_

In [13]:
importances

array([0.00415478, 0.00389142, 0.        , 0.00397547, 0.01135478,
       0.        , 0.00071776, 0.00144649, 0.00442882, 0.00107116,
       0.        , 0.00444045, 0.00637958, 0.00138662, 0.01065997,
       0.04177093, 0.00080362, 0.0127068 , 0.00379207, 0.00290186,
       0.        , 0.00444225, 0.00323891, 0.00647826, 0.0040954 ,
       0.00936362, 0.00890808, 0.00413599, 0.        , 0.        ,
       0.00491609, 0.00988065, 0.        , 0.        , 0.        ,
       0.        , 0.0049945 , 0.00508218, 0.        , 0.        ,
       0.00172293, 0.00398711, 0.0035887 , 0.00446293, 0.0038846 ,
       0.00473872, 0.00546884, 0.        , 0.        , 0.00478376,
       0.        , 0.01900442, 0.0040364 , 0.00524162, 0.01018329,
       0.00058679, 0.00102801, 0.01103442, 0.00574714, 0.00856482,
       0.        , 0.00500985, 0.00586103, 0.00115569, 0.07723608,
       0.05787819, 0.00455286, 0.        , 0.00375153, 0.00458332,
       0.        , 0.00663642, 0.00533363, 0.01118185, 0.00220

In [14]:
np.argsort(importances)[::-1]

array([ 64, 133,  65,  15, 144, 120, 101,  51,  89,  90,  17, 100, 124,
       145, 163,   4,  73,  57,  14,  54,  31,  25,  77, 132, 162,  26,
       171,  59, 131, 176,  86,  78, 149,  93,  71,  23,  12, 127, 165,
        95, 141, 117,  62,  82,  58,  99, 112, 126,  46, 108,  72,  53,
       156,  37, 119,  61,  36, 105,  30, 116,  49,  45, 140,  91,  94,
       123,  69,  66, 164,  43,  21,  11,  75,   8, 150, 167, 138, 115,
       166,   0, 106,  27, 175,  24, 129,  52,  41,   3,   1,  44,  81,
       178,  18,  68, 154, 110,  97,  42, 136, 148, 103, 168,  79,  22,
        98, 151,  85,  96,  19, 130, 107, 128, 125, 111, 147, 146,  74,
        40, 122,   7, 118,  84, 152,  13,  76, 161, 160, 159,  63,   9,
        56, 114,  92,  16,   6,  55, 177,  20, 170, 169, 139, 172, 173,
        10, 174,  67,   5, 104, 102,   2,  70,  88,  28, 158,  87, 142,
       143, 113, 137,  50,  48,  47,  80,  60,  83, 135, 134,  39,  38,
       153, 121,  35,  34,  33,  32, 155, 157,  29, 109])

In [None]:
error

### Load results to BQ - WIP

In [None]:
SCORE_TABLE_ID = 'bq_telus_12_months_churn_scores'

project_id = PROJECT_ID 
dataset_id = DATASET_ID
score_table_id = SCORE_TABLE_ID

# get full score to cave into bucket
pred_prob = xgb_model.predict_proba(X_val, ntree_limit=xgb_model.best_iteration)[:, 1]
result = pd.DataFrame(columns=['ban', 'subscriber_no', 'score_date', 'y_true', 'y_pred'])

result['ban'] = list(ban_val['ban'])
result['ban'] = result['ban'].astype('str')

result['subscriber_no'] = list(ban_val['ban'])
result['subscriber_no'] = result['subscriber_no'].astype('str')

result['score_date'] = "2023-11-04"

result['y_true'] = list(y_val)
result['y_true'] = result['y_true'].fillna(0.0).astype('float64')

result['y_pred'] = list(pred_prob)
result['y_pred'] = result['y_pred'].fillna(0.0).astype('float64')

############# updated up to here ############

result.to_csv('gs://{}/{}/ucar/{}_prediction_v2.csv'.format(file_bucket, service_type, service_type), index=False)

# define dtype_bq_mapping
dtype_bq_mapping = {np.dtype('int64'): 'INTEGER', 
np.dtype('float64'):  'FLOAT', 
np.dtype('float32'):  'FLOAT', 
np.dtype('object'):  'STRING', 
np.dtype('bool'):  'BOOLEAN', 
np.dtype('datetime64[ns]'):  'DATE', 
pd.Int64Dtype(): 'INTEGER'} 

# export df_final to bigquery 
schema_list = [] 
for column in result.columns: 
    schema_list.append(bigquery.SchemaField(column, dtype_bq_mapping[result.dtypes[column]], mode='NULLABLE')) 
print(schema_list) 

dest_table = f'{project_id}.{dataset_id}.{score_table_id}'

# Sending to bigquery 
client = get_gcp_bqclient(project_id)
job_config = bigquery.LoadJobConfig(schema=schema_list, write_disposition='WRITE_TRUNCATE') 
job = client.load_table_from_dataframe(result, dest_table, job_config=job_config) 
job.result() 

table = client.get_table(dest_table) # Make an API request 
print("Loaded {} rows and {} columns to {}".format(table.num_rows, len(table.schema), table_id)) 

time.sleep(60)

# table_ref = f'{project_id}.{dataset_id}.{score_table}'
# client = bigquery.Client(project=project_id)
# table = client.get_table(table_ref)
# schema = table.schema

# ll = []
# for item in schema:
#     col = item.name
#     d_type = item.field_type
#     if 'float' in str(d_type).lower():
#         d_type = 'FLOAT64'
#     ll.append((col, d_type))

#     if 'integer' in str(d_type).lower():
#         result[col] = result[col].fillna(0).astype(int)
#     if 'float' in str(d_type).lower():
#         result[col] = result[col].fillna(0.0).astype(float)
#     if 'string' in str(d_type).lower():
#         result[col] = result[col].fillna('').astype(str)

# table_ref = '{}.{}.{}'.format(project_id, dataset_id, temp_table)
# client = bigquery.Client(project=project_id)
# if if_tbl_exists(client, table_ref):
#     client.delete_table(table_ref)

# client.create_table(table_ref)
# config = bigquery.LoadJobConfig(schema=schema)
# config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
# bq_table_instance = client.load_table_from_dataframe(result, table_ref, job_config=config)
# time.sleep(5)

# drop_sql = f"""delete from `{project_id}.{dataset_id}.{score_table}` where score_date = '{score_date_dash}'"""  # .format(project_id, dataset_id, score_date_dash)
# client.query(drop_sql)
# #
# load_sql = f"""insert into `{project_id}.{dataset_id}.{score_table}`
#               select * from `{project_id}.{dataset_id}.{temp_table}`"""
# client.query(load_sql)


In [None]:
error

### save the model in gcs

In [None]:
# save the model in GCS
from datetime import datetime
models_dict = {}
create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
models_dict['create_time'] = create_time
models_dict['model'] = xgb_model
models_dict['features'] = features

with open('model_dict.pkl', 'wb') as handle:
    pickle.dump(models_dict, handle)
handle.close()

storage_client = storage.Client()
bucket = storage_client.get_bucket(file_bucket)

MODEL_PATH = '{}_xgb_models/'.format(service_type)
blob = bucket.blob(MODEL_PATH)
if not blob.exists(storage_client):
    blob.upload_from_string('')

model_name_onbkt = '{}{}_models_xgb_{}'.format(MODEL_PATH, service_type, models_dict['create_time'])
blob = bucket.blob(model_name_onbkt)
blob.upload_from_filename('model_dict.pkl')

print(f"....model loaded to GCS done at {str(create_time)}")

time.sleep(300)

### load the latest saved xgb_model to the environment

In [None]:
# MODEL_PATH = '{}_xgb_models/'.format(service_type)
# df_score = pd.read_csv('gs://{}/{}_score.csv.gz'.format(file_bucket, service_type), compression='gzip')
# df_score.dropna(subset=['ban'], inplace=True)
# df_score.reset_index(drop=True, inplace=True)
# print('......scoring data loaded:{}'.format(df_score.shape))
# time.sleep(10)
from google.cloud import bigquery
from google.cloud import storage

MODEL_PATH = '{}_xgb_models/'.format(service_type)

def load_model(file_bucket: str, service_type: str): 
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)
    blobs = storage_client.list_blobs(file_bucket, prefix='{}{}_models_xgb_'.format(MODEL_PATH, service_type))

    model_lists = []
    for blob in blobs:
        model_lists.append(blob.name)

    blob = bucket.blob(model_lists[-1])
    blob_in = blob.download_as_string()
    model_dict = pickle.loads(blob_in)
    xgb_model = model_dict['model']
    features = model_dict['features']
    print('...... model loaded')
    time.sleep(10)
    
    return xgb_model, features

xgb_model, features = load_model(file_bucket = FILE_BUCKET, service_type = SERVICE_TYPE) 

### backup codes 

In [None]:
def get_gcp_bqclient(project_id, use_local_credential=True):
    token = os.popen('gcloud auth print-access-token').read()
    token = re.sub(f'\n$', '', token)
    credentials = google.oauth2.credentials.Credentials(token)

    bq_client = bigquery.Client(project=project_id)
    if use_local_credential:
        bq_client = bigquery.Client(project=project_id, credentials=credentials)
    return bq_client

client = get_gcp_bqclient(project_id)

#instantiate df_target_train and df_target_test
sql_train = ''' SELECT * FROM `{}.{}.bq_tos_cross_sell_targets_q3` '''.format(project_id, dataset_id)
df_target_train = client.query(sql_train).to_dataframe()
df_target_train = df_target_train.loc[
    df_target_train['YEAR_MONTH'] == "2022-Q3"] #'-'.join(score_date_dash.split('-')[:2])]  # score_date_dash = '2022-08-31'

#set up df_train and df_test (add 'target')
df_target_train['ban'] = df_target_train['ban'].astype('int64')
df_target_train = df_target_train.groupby('ban').tail(1)

df_train = df_train.merge(df_target_train[['ban', 'product_crosssell_ind']], on='ban', how='left')
df_train.rename(columns={'product_crosssell_ind': 'target'}, inplace=True)
df_train.dropna(subset=['target'], inplace=True)
df_train['target'] = df_train['target'].astype(int)

df_train

In [None]:
sum(df_train['target'])

In [None]:
def get_gcp_bqclient(project_id, use_local_credential=True):
    token = os.popen('gcloud auth print-access-token').read()
    token = re.sub(f'\n$', '', token)
    credentials = google.oauth2.credentials.Credentials(token)

    bq_client = bigquery.Client(project=project_id)
    if use_local_credential:
        bq_client = bigquery.Client(project=project_id, credentials=credentials)
    return bq_client

client = get_gcp_bqclient(project_id)
sql_test = ''' SELECT * FROM `{}.{}.bq_tos_cross_sell_targets_q4` '''.format(project_id, dataset_id)
df_target_test = client.query(sql_test).to_dataframe()
df_target_test = df_target_test.loc[
    df_target_test['YEAR_MONTH'] == "2022-Q4"] #'-'.join(score_date_val_dash.split('-')[:2])]  # score_date_dash = '2022-09-30'

#set up df_train and df_test (add 'target')
df_target_test['ban'] = df_target_test['ban'].astype('int64')
df_target_test = df_target_test.groupby('ban').tail(1)

df_test = df_test.merge(df_target_test[['ban', 'product_crosssell_ind']], on='ban', how='left')
df_test.rename(columns={'product_crosssell_ind': 'target'}, inplace=True)
df_test.dropna(subset=['target'], inplace=True)
df_test['target'] = df_test['target'].astype(int)

df_test

In [None]:
sum(df_test['target'])