### import required libraries

In [None]:
import os
import re
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import pandas as pd 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# first neural network with keras tutorial
from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from scikeras.wrappers import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

# --- Pipeline configuration -------------------------------------------------
# Identifiers for the GCP project, storage buckets and BigQuery tables used by
# this notebook. The trailing "#mapping[...]" notes record the config key each
# hard-coded value stands in for.
SERVICE_TYPE = 'nba_product_reco_model'
DATASET_ID = 'nba_product_reco_model'
PROJECT_ID = 'divg-groovyhoon-pr-d2eab4' #mapping['PROJECT_ID']
RESOURCE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default' #mapping['resources_bucket']
FILE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default' #mapping['gcs_csv_bucket']
REGION = 'northamerica-northeast1' #mapping['REGION']
MODEL_ID = '9999'
# The folder name has no "{}" placeholder, so the former `.format(MODEL_ID)`
# call never changed the value; it is dropped to avoid implying the folder is
# model-id specific.
FOLDER_NAME = 'nba_product_reco_model'
QUERIES_PATH = 'vertex_pipelines/' + FOLDER_NAME + '/queries/'
TRAIN_TABLE_ID = 'nba_training_dataset_v7'
VAL_TABLE_ID = 'nba_val_dataset_v7'
TEST_TABLE_ID = 'nba_test_dataset_v7'
SCORE_TABLE_ID = 'bq_product_reco_scores'


In [None]:
# IPython magics (notebook only): enable the autoreload extension in mode 2 so
# imported modules are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

### import bq to dataframe function

In [None]:
import pandas as pd 
import numpy as np 
from google.cloud import bigquery
from google.oauth2 import credentials

def import_bq_to_dataframe(project_id, dataset_id, table_id, client):
    """
    Read every row and column of one BigQuery table into a DataFrame.

    Args:
        project_id: Project that owns the table.
        dataset_id: Dataset that contains the table.
        table_id: Name of the table to read.
        client: Object exposing ``query(sql)`` whose result offers
            ``to_dataframe()``, e.g. ``bigquery.Client(project=project_id)``.

    Returns:
        Whatever ``client.query(...).to_dataframe()`` returns (a DataFrame
        when ``client`` is a BigQuery client).

    Example:
        client = bigquery.Client(project='bi-stg-divg-speech-pr-9d940b')
        import_bq_to_dataframe('bi-stg-divg-speech-pr-9d940b',
                               'call_to_retention_dataset',
                               'bq_ctr_pipeline_dataset',
                               client)
    """
    # The fully qualified table name is back-quoted inside the statement.
    select_all = f"SELECT * FROM `{project_id}.{dataset_id}.{table_id}`"
    query_job = client.query(select_all)
    return query_job.to_dataframe()
    

### define get_lift function, import df_train, df_val and df_test from BigQuery

In [None]:
import gc
import time
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

# Lower-case aliases of the notebook-level configuration constants, used by the
# cells below. (`project_id` used to be assigned twice; once is enough.)
project_id = PROJECT_ID
region = REGION
resource_bucket = RESOURCE_BUCKET
file_bucket = FILE_BUCKET
service_type = SERVICE_TYPE
dataset_id = DATASET_ID
train_table_id = TRAIN_TABLE_ID
val_table_id = VAL_TABLE_ID
test_table_id = TEST_TABLE_ID

def get_lift(prob, y_test, q):
    """
    Build a lift table by bucketing predictions into ``q`` quantile bins.

    Args:
        prob: Predicted scores, one per observation (list, array or Series).
        y_test: Observed outcomes in the same positional order as ``prob``.
        q: Number of quantile bins passed to ``pd.qcut``.

    Returns:
        A DataFrame with one row per bin and the columns ``Decile``,
        ``avg_model_pred_churn_rate`` (mean of ``prob`` in the bin),
        ``avg_churn_rate_total`` (mean of ``y_test`` over all rows),
        ``avg_real_churn_rate`` (mean of ``y_test`` in the bin) and ``lift``
        (bin mean divided by overall mean). Bins are labelled ``q`` .. ``1``
        from the lowest to the highest scores, and rows are returned with
        label 1 (highest scores) first.

    Raises:
        ValueError: Propagated from ``pd.qcut`` when the quantile edges are
            not unique (e.g. many identical scores).
    """
    # Pair scores and outcomes by position. Assigning a Series with a
    # non-default index (e.g. straight out of train_test_split) into a frame
    # aligns on index labels and would silently mis-pair or NaN the outcomes.
    result = pd.DataFrame({
        'Prob': np.asarray(prob).ravel(),
        'Churn': np.asarray(y_test).ravel(),
    })
    result['Decile'] = pd.qcut(result['Prob'], q, labels=list(range(q, 0, -1)))

    # observed=False keeps a row for every bin label, including empty ones,
    # independent of the pandas version's default for categorical groupers.
    by_decile = result.groupby('Decile', observed=False)

    actual = by_decile['Churn'].mean().reset_index()
    actual.columns = ['Decile', 'avg_real_churn_rate']

    lg = by_decile['Prob'].mean().reset_index()
    lg.columns = ['Decile', 'avg_model_pred_churn_rate']
    # Descending categorical order puts label 1 (highest scores) first.
    lg.sort_values('Decile', ascending=False, inplace=True)
    lg['avg_churn_rate_total'] = result['Churn'].mean()
    lg = lg.merge(actual, on='Decile', how='left')
    lg['lift'] = lg['avg_real_churn_rate'] / lg['avg_churn_rate_total']

    return lg

def get_gcp_bqclient(project_id, use_local_credential=True):
    """
    Create a BigQuery client for ``project_id``.

    Args:
        project_id: Project the client is bound to.
        use_local_credential: When True (default), authenticate with the
            access token printed by ``gcloud auth print-access-token``.
            When False, let ``bigquery.Client`` resolve credentials itself.

    Returns:
        A ``bigquery.Client`` instance.

    Raises:
        RuntimeError: If ``use_local_credential`` is True but the gcloud
            command produced no token.
    """
    if not use_local_credential:
        # No token needed, so do not shell out to gcloud at all.
        return bigquery.Client(project=project_id)

    # Close the pipe deterministically and drop the trailing newline.
    with os.popen('gcloud auth print-access-token') as pipe:
        token = pipe.read().strip()

    if not token:
        # Fail here with a clear message instead of building credentials from
        # an empty token and failing later on the first query.
        raise RuntimeError(
            "'gcloud auth print-access-token' returned no token; "
            "check that gcloud is installed and you are logged in."
        )

    user_credentials = google.oauth2.credentials.Credentials(token)
    # Build only the token-authenticated client: constructing a default client
    # first could fail when no default credentials are configured.
    return bigquery.Client(project=project_id, credentials=user_credentials)

# BigQuery client used for all table reads below.
client = get_gcp_bqclient(project_id)

# Load the train / validation / test tables, in that order.
# Original notes: train covers Jan to Oct; val and test cover Nov and Dec.
df_train, df_val, df_test = (
    import_bq_to_dataframe(project_id, dataset_id, table_name, client)
    for table_name in (train_table_id, val_table_id, test_table_id)
)

# Integer class id for each model scenario (position in the tuple = class id).
# Disabled in the original mapping: 'mobility_acquisition', 'tos_upsell'.
scenario_to_target = {
    scenario: class_id
    for class_id, scenario in enumerate((
        'hsic_acquisition',
        'ttv_acquisition',
        'sing_acquisition',
        'shs_acquisition',
        'tos_acquisition',
        'wifi_acquisition',
        'lwc_acquisition',
        'sws_acquisition',
        'hpro_acquisition',
        'whsia_acquisition',
        'ttv_upsell',
        'shs_renewal',
        'shs_upsell',
    ))
}

# Add the numeric target column to every split; scenarios missing from the
# mapping become NaN (Series.map behaviour).
for frame in (df_train, df_val, df_test):
    frame['target'] = frame['model_scenario'].map(scenario_to_target)

for label, frame in (('df_train', df_train), ('df_val', df_val), ('df_test', df_test)):
    print(f'{label}: {frame.shape}')


### fillna text cols

In [None]:
# Default value used for missing entries in each listed column.
fill_values = {
    'demogr_lifestage_sort': 6,
    'cust_pref_lang_txt': 'NOT AVAILABLE',
    'demogr_census_division_typ': 'NA',
    'cust_prov_state_cd': 'N/AVAIL',
    'cust_cr_val_txt': 'NOT AVAILABLE',
    'acct_ebill_ind': 'N',
}

# Apply the same defaults to all three splits. The result is assigned back to
# the column: `df[col].fillna(..., inplace=True)` operates on a temporary
# selection, which pandas deprecates and which has no effect under
# Copy-on-Write.
for frame in (df_train, df_val, df_test):
    for column, default in fill_values.items():
        frame[column] = frame[column].fillna(default)

print(f'df_train: {df_train.shape}')
print(f'df_val: {df_val.shape}')
# Previously printed df_val.shape under the df_test label.
print(f'df_test: {df_test.shape}')

### define train, val, test sets

In [None]:
# Stratified random 70/30 split of the training table on `target`.
# NOTE(review): this rebinds `df_test`, so the test table loaded from BigQuery
# earlier is discarded from here on, and the split is random rather than by
# month (the original notes mentioned "Jan to Aug" / "Sep to Oct"). Confirm
# that this is intended.
df_train, df_test = train_test_split(df_train, test_size=0.3, random_state=42, stratify=df_train['target'])

# df_train.to_csv('gs://{}/{}/{}_train_final.csv'.format(FILE_BUCKET, SERVICE_TYPE, SERVICE_TYPE), index=False)
# df_test.to_csv('gs://{}/{}/{}_test_final.csv'.format(FILE_BUCKET, SERVICE_TYPE, SERVICE_TYPE), index=False)
# df_val.to_csv('gs://{}/{}/{}_val_final.csv'.format(FILE_BUCKET, SERVICE_TYPE, SERVICE_TYPE), index=False)

# Columns present in all three splits.
cols_1 = df_train.columns.values
cols_2 = df_val.columns.values
cols_3 = df_test.columns.values

cols = set(cols_1) & set(cols_2) & set(cols_3)

# Identifier, date, label and bookkeeping columns that must not be model inputs.
features_to_exclude = ['split_type', 'model_scenario', 'ref_dt', 'cust_id', 'cust_src_id', 'ban', 'ban_src_id', 'lpds_id',
                       'fms_address_id', 'label', 'label_dt', 'prod_latest_actvn_dt', 'prod_latest_deactvn_dt', 'target',
                       'contract_end_date_hsic', 'contract_end_date_sing', 'contract_end_date_ttv', 'contract_end_date_smhm']

# Walk df_train's columns instead of the set so the feature order is the same
# on every run (set iteration order of strings varies between interpreter
# sessions, which would reorder the model's input columns).
features = [f for f in cols_1 if f in cols and f not in features_to_exclude]

# Per split: account/product identifiers, feature matrix, target as a flat
# numpy array, and target as a Series.
ban_train = df_train[['ban', 'lpds_id']]
X_train = df_train[features]
y_train = np.squeeze(df_train['target'].values)
target_train = df_train['target']

ban_val = df_val[['ban', 'lpds_id']]
X_val = df_val[features]
y_val = np.squeeze(df_val['target'].values)
target_val = df_val['target']

ban_test = df_test[['ban', 'lpds_id']]
X_test = df_test[features]
y_test = np.squeeze(df_test['target'].values)
target_test = df_test['target']

### label encoder

In [None]:
# Now we need to transform the features of the feature store.
def encode_categorical_features(train_df, test_df, val_df):
    """
    Integer-encode categorical columns consistently across the three splits.

    The columns to encode are the ``object`` / ``category`` columns of
    ``train_df``. For each one, the sorted distinct non-null training values
    are numbered 0..k-1 and that same numbering is applied to all three
    frames. Values not seen in training (and missing values) are encoded
    as -1.

    Previously a new encoder was fitted separately on every split, so the
    same category could receive a different code in train, test and val.

    Args:
        train_df: Training frame; defines the columns and the code book.
        test_df: Test frame (second positional argument).
        val_df: Validation frame (third positional argument).

    Returns:
        ``(train_df, val_df, test_df)``. Note that the return order differs
        from the parameter order. The frames are modified in place and the
        same objects are returned.
    """
    # Get a list of all categorical columns
    cat_columns = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

    for col in cat_columns:
        # Build the code book from training data only, before the training
        # column itself is overwritten with codes.
        categories = sorted(train_df[col].dropna().unique())
        for frame in (train_df, test_df, val_df):
            codes = pd.Categorical(frame[col], categories=categories).codes
            frame[col] = codes.astype('int64')

    return train_df, val_df, test_df

# Exclude identifier columns so they don't get encoded. Take copies so the
# encoder writes into independent frames rather than slices of X_train /
# X_val / X_test.
id_columns = ['cust_id', 'ban', 'lpds_id', 'ref_dt', 'model_scenario']
train_label_data = X_train[X_train.columns.difference(id_columns)].copy()
val_label_data = X_val[X_val.columns.difference(id_columns)].copy()
test_label_data = X_test[X_test.columns.difference(id_columns)].copy()

# encode_categorical_features takes (train_df, test_df, val_df) but returns
# (train, val, test). Passing (train, val, test) positionally made the
# validation and test frames come back swapped, so the arguments are passed by
# keyword here.
X_train, X_val, X_test = encode_categorical_features(
    train_df=train_label_data,
    test_df=test_label_data,
    val_df=val_label_data,
)

#set up features (list)
cols_1 = X_train.columns.values
cols_2 = X_val.columns.values
cols_3 = X_test.columns.values

cols = set(cols_1) & set(cols_2) & set(cols_3)

# Follow X_train's column order so the feature order is identical on every run
# (iterating the set directly gives a session-dependent order).
features = [f for f in cols_1 if f in cols and f not in features_to_exclude]

X_train = X_train[features]
X_val = X_val[features]
X_test = X_test[features]

X_train_array = X_train.values
X_val_array = X_val.values
X_test_array = X_test.values

X_train.head()

In [None]:
# # define baseline model
# def baseline_model():
#     # create model
#     model = Sequential()
#     model.add(Dense(8, input_dim=176, activation='relu'))
#     model.add(Dense(3, activation='relu'))
#     model.add(Dense(10, activation='softmax'))
    
#     # Compile model
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
#     return model

# estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

# kfold = KFold(n_splits=10, shuffle=True)

# results = cross_val_score(estimator, X_train, y_train, cv=kfold)

# print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
def get_model(hidden_layer_dim, meta):
    """
    Build the (uncompiled) feed-forward network used by the classifier wrapper.

    Layers: Dense(n_features_in_, relu) -> Dense(hidden_layer_dim, relu)
    -> Dense(n_classes_, softmax).

    Args:
        hidden_layer_dim: Number of units in the second dense layer.
        meta: Dict of input metadata handed in by the wrapper; the keys
            ``n_features_in_``, ``X_shape_`` and ``n_classes_`` are read.

    Returns:
        A ``Sequential`` model.
    """
    # note that meta is a special argument that will be
    # handed a dict containing input metadata
    n_features_in_ = meta["n_features_in_"]
    X_shape_ = meta["X_shape_"]
    n_classes_ = meta["n_classes_"]

    # Use the Sequential / Dense classes imported at the top of the notebook;
    # the bare name `keras` is never bound here, so `keras.models...` raised
    # NameError. Activations are passed to Dense instead of separate
    # Activation layers (same computation).
    model = Sequential()
    model.add(Dense(n_features_in_, input_shape=X_shape_[1:], activation="relu"))
    model.add(Dense(hidden_layer_dim, activation="relu"))
    model.add(Dense(n_classes_, activation="softmax"))
    return model

# Wrap get_model in a scikit-learn style classifier; `hidden_layer_dim` is
# forwarded to get_model.
clf = KerasClassifier(
    get_model,
    loss="sparse_categorical_crossentropy",
    hidden_layer_dim=100,
)

# Fit on the training split (was `y_tarin`, an undefined name) and score the
# validation split.
clf.fit(X_train, y_train)
y_proba = clf.predict_proba(X_val)