In [None]:
#### import global modules
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery
import gc

# Set global vars
# The project root is whatever precedes the first occurrence of 'notebooks' in
# the current working directory; every other path hangs off it.
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_queries = pth_project / 'core' / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Make the project's own packages (e.g. `core`) importable from the notebook.
sys.path.insert(0, str(pth_project))
# Read the YAML config inside a context manager so the file handle is closed
# as soon as parsing finishes instead of lingering until garbage collection.
with pth_creds.open() as config_file:
    d_project_config = safe_load(config_file)
# d_params = safe_load((pth_project / 'core' / 'parameters' / 'common.yaml').open())['data_extract']

# import local modules
from core.utils.gcp import connect_bq_services
# from core.etl.extract import extract_bq_data, extract_pr_codes, format_conv_df, filter_convs

# Connect to google services
bq_client = connect_bq_services(d_project_config['gcp-project-name'])
pd.options.display.max_rows = 100
from sklearn.model_selection import train_test_split


In [None]:
def extract_bq_data(bq_client, sql=None, pth_query=None):
    """Run a query through `bq_client` and return its result as a DataFrame.

    Args:
        bq_client: client object exposing ``query(sql)``; the object it returns
            must provide ``to_dataframe()``.
        sql: query text. Takes precedence over `pth_query` when both are given.
        pth_query: location of a file holding the query text, used only when
            `sql` is None. Accepts a ``pathlib.Path`` (or any object with
            ``read_text()``) as well as a plain string path.

    Returns:
        Whatever ``bq_client.query(sql).to_dataframe()`` returns.

    Raises:
        ValueError: if neither `sql` nor `pth_query` is provided.
    """
    if sql is None:
        if pth_query is None:
            raise ValueError('`sql` or `pth_query` should be set')
        # A bare string has no read_text(); wrap it so both spellings work.
        if isinstance(pth_query, str):
            pth_query = Path(pth_query)
        sql = pth_query.read_text()
    # Single execution path regardless of where the query text came from.
    return bq_client.query(sql).to_dataframe()

In [None]:
Query= '''

select * from `divgpras-pr-579355.SHS.SHS_churn_model_train_data_dec22_feb23`

'''

In [None]:
DF=extract_bq_data(bq_client, sql=Query)

In [None]:
DF.info()

In [None]:
# DF.to_csv('SHS_churn_model_train_data_sep_nov_2022.csv',index=False)

In [None]:
# DF['login_consistency']=DF['login_consistency'].astype('int')

In [None]:
# DF['Best_Practices_1']=DF['Best_Practices_1'].fillna(0,inplace=True)
# DF['Best_Practices_2']=DF['Best_Practices_2'].fillna(0,inplace=True)
# DF['Best_Practices_3']=DF['Best_Practices_3'].fillna(0,inplace=True)
# DF['Best_Practices_4']=DF['Best_Practices_4'].fillna(0,inplace=True)
# DF['Best_Practices_5']=DF['Best_Practices_5'].fillna(0,inplace=True)
# DF['Best_Practices_6']=DF['Best_Practices_6'].fillna(0,inplace=True)
# DF['Best_Practice_All']=DF['Best_Practice_All'].fillna(0,inplace=True)

In [None]:
DF['Best_Practices_1'].value_counts()

In [None]:
# Identifier columns: kept in the frame but never used as model features.
ID_cols = [
    'customer_id',
    'dealer_customer_id',
    'BAN',
]

# Columns that are deliberately excluded from the feature set.
ignore_cols = [
    'primary_login_id',
    'dealer_name',
    'join_date',
    'account_type_name',
    'customer_type_name',
    'cust_bus_cust_id',
    'contract_start_date',
    'contract_end_date',
    'customer_id_1',
    'customer_id_2',
    'customer_id_3',
    'ban_1',
    'ban_2',
    'BAN_3',
    'sum_arm_commands',
    'sum_disarm_commands',
    'number_days_arming_disarming',
    'count_of_dates_arming',
    'sum_login_count',
    'number_of_login_days',
    'count_of_dates_login',
    'shs_count',
    'Telus_Cust_ID',
    'Best_Practice_All',
    'Intial_activation_date',
    'Telus_ID',
]

# Label column, kept as a one-element list so it can index the frame directly.
Target_col = ['SMHM_Churn']

In [None]:
# Coerce both ARPU columns to a numeric dtype so they are picked up as numeric
# features (presumably they arrive from the query as non-numeric objects -
# TODO confirm against DF.info()).
for arpu_col in ('Avg_SMHM_ARPU', 'Avg_FFH_ARPU'):
    DF[arpu_col] = pd.to_numeric(DF[arpu_col])


In [None]:
# DF['Best_Practices_1']=DF['Best_Practices_1'].astype('category')
# DF['Best_Practices_2']=DF['Best_Practices_2'].astype('category')
# DF['Best_Practices_3']=DF['Best_Practices_3'].astype('category')
# DF['Best_Practices_4']=DF['Best_Practices_4'].astype('category')
# DF['Best_Practices_5']=DF['Best_Practices_5'].astype('category')
# DF['Best_Practices_6']=DF['Best_Practices_6'].astype('category')
# DF['Best_Practice_All']=DF['Best_Practice_All'].astype('category')
# DF['SMHM_Churn']=DF['SMHM_Churn'].astype('category')



In [None]:

# Split the frame's columns by dtype: numeric vs. everything else.
num_cols=DF.select_dtypes(include=np.number).columns.tolist()
cat_cols=DF.select_dtypes(exclude=np.number).columns.tolist()

# Identifiers, the target and the ignore list are never offered to the model.
_non_feature_cols = set(ID_cols) | set(Target_col) | set(ignore_cols)

# Filter with comprehensions instead of set subtraction: a set of strings has
# no stable iteration order (it changes with the interpreter's hash seed), so
# the feature order - and with it the one-hot column layout and the saved
# column JSON - would differ between kernel restarts. Filtering keeps the
# DataFrame's own column order.
num_cols_features=[col for col in num_cols if col not in _non_feature_cols]
cat_cols_features=[col for col in cat_cols if col not in _non_feature_cols]
feature_cols=cat_cols_features+num_cols_features

In [None]:
len(feature_cols)

In [None]:
num_cols_features

In [None]:
cat_cols_features

In [None]:
DF[feature_cols].isnull().sum() * 100 / len(DF[feature_cols])

In [None]:
# Per-package summary: number of distinct Telus_ID values and the mean of
# Avg_SMHM_ARPU within each Package.
package_summary = (
    DF.groupby(['Package'])
    .agg(
        Customer_count=('Telus_ID', 'nunique'),
        SMHM_ARPU=('Avg_SMHM_ARPU', 'mean'),
    )
    .reset_index()
)
pd.DataFrame(package_summary)

In [None]:
DF['login_consistency'].isnull().sum()

In [None]:
def login_consistency_cat(row):
    """Bucket one row by its `login_consistency` score.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing
            'login_consistency' and 'Interactivity_flag'.

    Returns:
        'Heavy_User' for a score above 30, 'Low_User' for a score in (0, 30].
        Otherwise the interactivity flag decides: 1 -> 'No_User',
        0 -> 'No_ADC_account', anything else -> 'None_of_the_above'.
    """
    score = row['login_consistency']
    if score > 30:
        return 'Heavy_User'
    if 0 < score <= 30:
        return 'Low_User'

    # No positive score (zero, negative or NaN): fall back to the flag.
    flag = row['Interactivity_flag']
    if flag == 1:
        return 'No_User'
    if flag == 0:
        return 'No_ADC_account'
    return 'None_of_the_above'
           

In [None]:
def arming_consistency_cat(row):
    """Bucket one row by its `arming_consistency` score.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing
            'arming_consistency' and 'Interactivity_flag'.

    Returns:
        'Heavy_User' for a score above 30, 'Low_User' for a score in (0, 30].
        Otherwise the interactivity flag decides: 1 -> 'No_User',
        0 -> 'No_ADC_account', anything else -> 'None_of_the_above'.
    """
    score = row['arming_consistency']
    if score > 30:
        return 'Heavy_User'
    if 0 < score <= 30:
        return 'Low_User'

    # No positive score (zero, negative or NaN): fall back to the flag.
    flag = row['Interactivity_flag']
    if flag == 1:
        return 'No_User'
    if flag == 0:
        return 'No_ADC_account'
    return 'None_of_the_above'

In [None]:
DF['login_consistency_cat']=DF.apply(login_consistency_cat,axis=1)

In [None]:
DF['arming_consistency_cat']=DF.apply(arming_consistency_cat,axis=1)

In [None]:
DF['login_consistency_cat'].value_counts()

In [None]:
# Swap the two raw consistency scores for their bucketed versions. Filtering
# the list (rather than subtracting sets) keeps the order of `feature_cols`
# instead of reshuffling it into an arbitrary, hash-seed-dependent order.
_raw_consistency_cols = {'arming_consistency', 'login_consistency'}
feature_cols_1=[col for col in feature_cols if col not in _raw_consistency_cols]+['login_consistency_cat','arming_consistency_cat']


In [None]:
feature_cols_1

In [None]:
# DF.info()

In [None]:
DF[feature_cols_1].isnull().sum() * 100 / len(DF[feature_cols_1])

In [None]:
# DF[DF['login_consistency_cat']=='No_User'][['Telus_ID','customer_id','login_consistency','Interactivity_flag','login_consistency_cat']]

In [None]:
def impute_missing_values(dataset=None):
    """Return a copy of `dataset` with missing values filled in.

    Text columns (object or string dtype) get the placeholder
    "Missing_category"; every other column gets 0.

    Args:
        dataset: DataFrame to impute. When omitted, the notebook-level `DF`
            is looked up at call time, so a reloaded `DF` is picked up
            without re-running this definition.

    Returns:
        A new DataFrame with the same columns and index. The input is left
        untouched, so the caller must use the returned frame.
    """
    if dataset is None:
        dataset = DF

    # Work on a copy: the caller often passes a column subset of a larger
    # frame, and writing into that slice triggers chained-assignment warnings
    # and mutates data the caller did not ask to change.
    imputed = dataset.copy()
    for col in imputed.columns:
        col_dtype = imputed[col].dtype
        is_text = (pd.api.types.is_object_dtype(col_dtype)
                   or pd.api.types.is_string_dtype(col_dtype))
        # Dedicated string dtypes reject a numeric fill value, so they take
        # the same placeholder as object columns.
        fill_value = "Missing_category" if is_text else 0
        imputed[col] = imputed[col].fillna(fill_value)
    return imputed

 
DF[feature_cols_1]=impute_missing_values(dataset=DF[feature_cols_1])

In [None]:
DF['SMHM_Churn'].value_counts()

In [None]:
def data_splitting(dataset=DF,feature_cols=[],Target=[],testsize=0.3):
    """One-hot encode the features and make a stratified train/validation split.

    Args:
        dataset: source DataFrame holding both features and target.
        feature_cols: names of the columns to use as features.
        Target: list with the target column name(s).
        testsize: share of rows assigned to the validation part.

    Returns:
        Tuple ``(X_train, X_validation, Y_train, Y_validation, columns)`` where
        `columns` is the list of encoded feature names found in `X_train`.
    """
    raw_features, labels = dataset[feature_cols], dataset[Target]
    # Dummy columns are named '<column>__<value>'.
    encoded_features = pd.get_dummies(raw_features, prefix_sep='__')

    # Fixed seed plus stratification on the labels: the split is repeatable
    # and both parts keep the label distribution.
    split_parts = train_test_split(
        encoded_features,
        labels,
        test_size=testsize,
        random_state=11,
        stratify=labels,
    )
    X_train, X_validation, Y_train, Y_validation = split_parts

    print()
    print("Data splitting Completed.")
    return X_train, X_validation, Y_train, Y_validation, X_train.columns.tolist()
# %%
X_train,X_validation,Y_train,Y_validation,final_feature_columns=data_splitting(
dataset=DF,feature_cols=feature_cols_1,Target=Target_col,testsize=0.3)

In [None]:
X_train.shape

In [None]:
final_feature_columns

In [None]:
import json

# Persist both column lists as JSON: the encoded (post one-hot) feature names
# and the feature names as they were before encoding.
for json_name, column_list in (
    ('Final_feature_columns.json', final_feature_columns),
    ('Intial_feature_columns.json', feature_cols_1),
):
    with open(json_name, 'w') as json_file:
        json.dump(column_list, json_file)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer

def _label_series(y, name):
    """Return the labels in `y` as a Series called `name`, keeping any index."""
    if isinstance(y, pd.DataFrame):
        if name in y.columns:
            labels = y[name]
        elif y.shape[1] == 1:
            # Single-column frame under another name: it can only be the labels.
            labels = y.iloc[:, 0]
        else:
            raise KeyError(f"label column {name!r} not found in {list(y.columns)}")
        return labels.rename(name)
    if isinstance(y, pd.Series):
        return y.rename(name)
    # Lists and arrays, including (n, 1) column vectors.
    return pd.Series(np.asarray(y).ravel(), name=name)


def _scored_frame(y, scores, name):
    """Pair labels and positive-class scores row by row in one DataFrame."""
    scored = _label_series(y, name).to_frame()
    # Positional assignment: scores line up with labels by order, never by
    # index label, so a non-unique label index cannot duplicate rows.
    scored['prob_of_predict_1'] = np.asarray(scores).ravel()
    return scored


def _build_lift_table(scored, target_column, n_bins=20):
    """Cut the scored rows into `n_bins` equal-sized bands and compute lift.

    `scored` must contain `target_column` and 'prob_of_predict_1'. The result
    has one row per band, ordered by 'SD' (1 = highest scores), with columns
    SD, Freq, Cumm Sum, Lift, Cumm Lift, Capture Rate, Cummu Capture Rate.
    """
    ranked = scored.sort_values(by=['prob_of_predict_1'])
    # rank(method='first') makes every rank unique, so qcut can always form
    # equal-sized bands even when many rows share the same score.
    score_band = pd.qcut(
        ranked['prob_of_predict_1'].rank(method='first'), q=n_bins, labels=False)
    # qcut numbers bands from the lowest scores upwards; flip them so that
    # SD == 1 is the band with the highest scores.
    ranked['SD'] = n_bins - score_band.astype(int)
    # Positives per band, broadcast to every row of that band.
    ranked['Freq'] = ranked.groupby('SD')[target_column].transform('sum')

    table = ranked[['SD', 'Freq']].drop_duplicates().sort_values(by=['SD'])
    total_positives = table['Freq'].sum()
    table['Cumm Sum'] = table['Freq'].cumsum()
    # Share of positives in the band relative to the band's share of rows.
    table['Lift'] = table['Freq'].div(total_positives).div(1 / n_bins)
    table['Cumm Lift'] = table['Cumm Sum'].div(total_positives).div(table['SD'] / n_bins)
    # Percentage of all positives that fall in the band.
    table['Capture Rate'] = table['Lift'] * (100 / n_bins)
    table['Cummu Capture Rate'] = table['Capture Rate'].cumsum()
    return table


def Capture_Rate_1stSD(ground_truth, prediction, n_bins=20):
    """Percentage of all positives captured by the top-scored band.

    Usable as a score function for ``make_scorer``.

    Args:
        ground_truth: true labels as a list, array (1-D or a single column),
            Series, or DataFrame.
        prediction: positive-class scores, one per label, in the same order.
        n_bins: number of equal-sized score bands; the default of 20 makes the
            top band the highest-scored 5% of rows.

    Returns:
        The capture rate of the top band, as a percentage rounded to two
        decimals. The value is also printed.
    """
    scored = _scored_frame(ground_truth, prediction, 'target')
    lift_table = _build_lift_table(scored, 'target', n_bins=n_bins)
    # First row is SD == 1; there the cumulative rate equals the band's own.
    Magic_score = round(lift_table['Cummu Capture Rate'].iloc[0], 2)
    print("Cumm Capture Rate on 1st SD is: ", Magic_score)
    return Magic_score


def lift_calc_pd(model, X_test, y_test, target_column, n_bins=20):
    """Score `X_test` with `model` and return the lift table.

    Args:
        model: fitted binary classifier exposing ``predict_proba``.
        X_test: features to score.
        y_test: true labels for `X_test` (DataFrame, Series or array).
        target_column: name of the label column.
        n_bins: number of equal-sized score bands (default 20).

    Returns:
        DataFrame with one row per band ordered by 'SD' (1 = highest scores)
        and the columns SD, Freq, Cumm Sum, Lift, Cumm Lift, Capture Rate,
        Cummu Capture Rate.
    """
    prediction = model.predict_proba(X_test)
    print("output shape", prediction.shape)
    print("y_test shape:", y_test.shape)

    # Naming both columns also checks that the output has exactly two classes.
    class_proba = pd.DataFrame(
        prediction, columns=['prob_of_predict_0', 'prob_of_predict_1'])
    scored = _scored_frame(y_test, class_proba['prob_of_predict_1'], target_column)
    return _build_lift_table(scored, target_column, n_bins=n_bins)


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [None]:
# Run labels. `Algo` is later used as the prefix of the saved model's file name.
Algo= 'XGB_SMHM'
#Optimization='TuneCV_BOHB_6thSD_Lift'
Optimization='RandomSearch_CaptureRate'

# Wrap the capture-rate metric as a scikit-learn scorer: higher is better and
# the metric is fed predicted probabilities rather than class labels.
# NOTE(review): scikit-learn 1.4 deprecated `needs_proba` in favour of
# `response_method="predict_proba"` - confirm the installed version before
# upgrading.
custom_scorer = make_scorer(
    Capture_Rate_1stSD, greater_is_better=True, needs_proba=True)

In [None]:
def _rounded_steps(start, stop, step, decimals):
    """Evenly spaced floats in [start, stop), rounded, as a plain list."""
    return np.round(np.arange(start, stop, step), decimals).tolist()


# Candidate values per XGBoost hyper-parameter for the search.
params = dict(
    gamma=[*range(1, 21)],
    subsample=_rounded_steps(0.2, 1, 0.1, 2),
    colsample_bytree=_rounded_steps(0.2, 1, 0.1, 2),
    n_estimators=np.arange(10, 200, 10).tolist(),
    max_depth=[*range(3, 7)],
    learning_rate=_rounded_steps(0.001, 0.3, 0.005, 3),
)

In [None]:
# Minimal grid used by the search objects: only the number of trees varies.
params_1 = dict(n_estimators=[10, 20])

In [None]:
# Base XGBoost classifier with library-default hyper-parameters; only the
# threading, verbosity and label-encoder options are set explicitly.
# NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases -
# confirm against the installed version.
xgb = XGBClassifier(
    #learning_rate=0.02,
    #n_estimators=50,
    # objective="binary:logistic",
    nthread=-1,
    verbosity=1,
    use_label_encoder=False
    # enable_categorical=True
    # tree_method="gpu_hist"  # this enables GPU.
    # See https://github.com/dmlc/xgboost/issues/2819
)

In [None]:
Y_train.value_counts()

In [None]:
# Y_train=Y_train.astype('category')

In [None]:
Y_train.isnull().sum()

In [None]:
Y_train.dtypes

In [None]:
search = RandomizedSearchCV(xgb, param_distributions=params_1,cv=3,error_score="raise")

In [None]:
search_1=GridSearchCV(estimator=xgb,param_grid=params_1,scoring=custom_scorer)

In [None]:
Y_train.values

In [None]:
# search_1.fit(X_train,Y_train.values)

In [None]:
xgb.fit(X_train,Y_train)


In [None]:
# search_1.best_params_

In [None]:
# pd.DataFrame(search_1.cv_results_)

In [None]:
val_DF_metric=lift_calc_pd(model=xgb,X_test=X_validation
,y_test=Y_validation,target_column=Target_col[0])

In [None]:
val_DF_metric

In [None]:
X_data=pd.concat([X_validation,Y_validation],axis=1)

In [None]:
# Lift table per package: keep only the validation rows whose one-hot package
# flag is set, score them, and write the table to that package's CSV.
# One loop replaces three copies of the same cell that differed only in the
# flag column and the output file name.
package_lift_files = {
    'Package__Smart_Camera': 'Smart_Camera_Lift.csv',
    'Package__Smart_Automation_Plus': 'Smart_Automation_Lift.csv',
    'Package__Monitored': 'Monitored_Lift.csv',
}

for package_flag, lift_csv in package_lift_files.items():
    # x_SC / y_sc / Test_lift keep their original names, so after the loop
    # they hold the last package ('Package__Monitored') exactly as before.
    x_SC = X_data[X_data[package_flag] == 1]
    y_sc = x_SC[Target_col]
    Test_lift = lift_calc_pd(model=xgb, X_test=x_SC[final_feature_columns],
                             y_test=y_sc, target_column=Target_col[0])
    Test_lift.to_csv(lift_csv, index=False)

In [None]:
# x_SC=X_data[X_data.Package__Unknown_Package==1]
# y_sc=x_SC[Target_col]


# Test_lift=lift_calc_pd(model=xgb,X_test=x_SC[final_feature_columns]
# ,y_test=y_sc,target_column=Target_col[0])
# Test_lift.to_csv('Unknown_Package_Lift.csv',index=False)

In [None]:
final_feature_columns

In [None]:
# Feature importance scores of type 'weight' taken from the fitted booster,
# as a mapping of feature name -> score.
feature_important = xgb.get_booster().get_score(importance_type='weight')
keys = list(feature_important.keys())
values = list(feature_important.values())

# One row per feature, highest score first.
data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)
data.nlargest(15, columns="score").plot(kind='barh', figsize = (20,10)) ## plot top 15 features


In [None]:
import shap
# model_shap = XGB_final_model.best_estimator_
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_train,check_additivity=False)
#%%

shap.summary_plot(shap_values, features=X_train
, feature_names=final_feature_columns,show=False,max_display=15)

In [None]:
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_validation,check_additivity=False)
#%%

shap.summary_plot(shap_values, features=X_validation
, feature_names=final_feature_columns,show=False,max_display=15)

In [None]:
# Y_train=Y_train.astype('int')
# Y_validation=Y_validation.astype('int')

In [None]:
import datetime
import joblib

# Timestamp for the model file name. %H is the 24-hour clock; the previous %I
# (12-hour clock) gave a morning and an afternoon run the same hour digits, so
# file names were ambiguous and did not sort chronologically.
date_time_now=datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
Model_filename_1=Algo+'_'+'SMHM_churn_model'+'_'+date_time_now+'.pkl'
# Serialise the fitted classifier into the working directory.
joblib.dump(xgb,Model_filename_1)


In [None]:
# X_train=X_train.astype('int')
# X_validation=X_validation.astype('int')

In [None]:
X_train.dtypes

In [None]:
Y_train=Y_train.astype('float64')
Y_validation=Y_validation.astype('float64')

In [None]:
# Cast the product-count columns of the training features to float64, one
# column at a time and in the same order as before.
for count_col in ('ttv_count', 'sing_count', 'hsic_count',
                  'product_mix_all', 'diic_count', 'stv_count'):
    X_train[count_col] = X_train[count_col].astype('float64')

In [None]:
# Cast the product-count columns of the validation features to float64, one
# column at a time and in the same order as before.
for count_col in ('ttv_count', 'sing_count', 'hsic_count',
                  'product_mix_all', 'diic_count', 'stv_count'):
    X_validation[count_col] = X_validation[count_col].astype('float64')

In [None]:
Y_train.dtypes

In [None]:
Y_train.info()

In [None]:
Y_train.head()

In [None]:
# Y_train_1=Y_train['SMHM_Churn'].astype('int').to_list()
# Y_validation_1=Y_validation['SMHM_Churn'].astype('int').to_list()

In [None]:
# Y_train_1 = Y_train['SMHM_Churn'].astype('int')
# Y_validation_1 = Y_validation['SMHM_Churn'].astype('int')

In [None]:
Y_train_1=np.squeeze(Y_train['SMHM_Churn'].values)
Y_validation_1 = np.squeeze(Y_validation['SMHM_Churn'].values)

In [None]:
# X_train['ttv_count']=X_train['ttv_count'].to_list()
# X_train['sing_count']=X_train['sing_count'].to_list()
# X_train['hsic_count']=X_train['hsic_count'].to_list()
# X_train['product_mix_all']=X_train['product_mix_all'].to_list()
# X_train['diic_count']=X_train['diic_count'].to_list()
# X_train['stv_count']=X_train['stv_count'].to_list()

In [None]:
# X_validation['ttv_count']=X_validation['ttv_count'].to_list()
# X_validation['sing_count']=X_validation['sing_count'].to_list()
# X_validation['hsic_count']=X_validation['hsic_count'].to_list()
# X_validation['product_mix_all']=X_validation['product_mix_all'].to_list()
# X_validation['diic_count']=X_validation['diic_count'].to_list()
# X_validation['stv_count']=X_validation['stv_count'].to_list()

In [None]:
X_train.reset_index(drop=True,inplace=True)
X_validation.reset_index(drop=True,inplace=True)
# X_test.reset_index(drop=True,inplace=True)

In [None]:
# import lightgbm as lgb

# model_lgb = lgb.LGBMClassifier(
#                         n_jobs=-1,
#                         n_estimators=100000,
#                         learning_rate=0.01,
#                         num_leaves=64,
#                         num_threads=4,
#                         max_depth=-1,
#                         tree_learner='serial',
#                         feature_fraction=0.7,
#                         bagging_freq=5,
#                         bagging_fraction=0.7,
#                         verbose=-1,
#                         bagging_seed=11
#                     )


# model_lgb.fit(X_train, Y_train_1, early_stopping_rounds=200, eval_set=[(X_train,Y_train_1),(X_validation, Y_validation_1)],eval_metric=custom_scorer,verbose=500)

In [None]:
Y_train.values.flatten()

In [None]:
Y_train['SMHM_Churn'].unique()