In [None]:
#### import global modules
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery
import gc

# Set global vars
# The project root is whatever precedes the first 'notebooks' substring in the
# working directory (the whole cwd when that substring is absent).
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_queries = pth_project / 'core' / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Make the project root importable so the `core` package below resolves.
sys.path.insert(0, str(pth_project))
# Read the config inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with pth_creds.open() as config_file:
    d_project_config = safe_load(config_file)
# d_params = safe_load((pth_project / 'core' / 'parameters' / 'common.yaml').open())['data_extract']

# import local modules
from core.utils.gcp import connect_bq_services
# from core.etl.extract import extract_bq_data, extract_pr_codes, format_conv_df, filter_convs

# Connect to google services
bq_client = connect_bq_services(d_project_config['gcp-project-name'])
pd.options.display.max_rows = 100

In [None]:
def extract_bq_data(bq_client, sql=None, pth_query=None):
    """Run a query through ``bq_client`` and return the result as a DataFrame.

    Parameters
    ----------
    bq_client
        Object exposing ``query(sql)`` whose result exposes ``to_dataframe()``.
    sql : str, optional
        Query text. Takes precedence over ``pth_query`` when both are given.
    pth_query : str or pathlib.Path, optional
        Path of a file containing the query text; only read when ``sql`` is None.

    Returns
    -------
    Whatever ``bq_client.query(sql).to_dataframe()`` returns.

    Raises
    ------
    ValueError
        If neither ``sql`` nor ``pth_query`` is provided.
    """
    if sql is None:
        if pth_query is None:
            raise ValueError('`sql` or `pth_query` should be set')
        # Path() accepts both str and Path, so plain string paths work too.
        sql = Path(pth_query).read_text()
    return bq_client.query(sql).to_dataframe()

In [None]:
Customer_Details_Query='''

SELECT customer_id,dealer_customer_id,primary_login_id,dealer_name,join_date,account_type_name,customer_type_name,primary_email,primary_phone,last_updt_ts
FROM `cio-datahub-enterprise-pr-183a.src_adc.bq_customer_account_details` 
where last_updt_ts >='2022-08-01'  and last_updt_ts < '2022-10-01'  
and account_type_name!='Standalone'
and customer_type_name='Customer'
QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY last_updt_ts DESC) = 1
order by dealer_name,join_date,customer_id


'''

In [None]:
Customer_DF=extract_bq_data(bq_client, sql=Customer_Details_Query)

In [None]:
Customer_DF.info()

In [None]:
Customer_DF.head()

In [None]:
TC_Query= '''


select * from `divgpras-pr-579355.ADC.Trouble_Condition_Group_last3months_Aug2022`

'''


In [None]:
TC_Data=extract_bq_data(bq_client, sql=TC_Query)

In [None]:
TC_Data.info()

In [None]:
TC_Data['TC_flag']=1

In [None]:
Customer_DF_1=Customer_DF.merge(TC_Data,on='customer_id',how='left')

In [None]:
Customer_DF_1.info()

In [None]:
Customer_DF_1['TC_flag'].isna().sum()/Customer_DF_1.shape[0]

In [None]:
# Rows with no match in the left merge have a missing TC_flag; mark them 0.
# Assign the filled column back: an inplace fill on a column selection is chained
# assignment, which warns on pandas 2.x and leaves the frame untouched under
# Copy-on-Write.
Customer_DF_1['TC_flag']=Customer_DF_1['TC_flag'].fillna(0)

In [None]:
Customer_DF_1['TC_flag'].value_counts(normalize=True)*100

In [None]:
Camera_Query='''

select customer_id, 1 as Camera_flag from `cio-datahub-enterprise-pr-183a.src_adc.bq_camera`
group by customer_id
order by customer_id


'''

In [None]:
Camera_DF=extract_bq_data(bq_client, sql=Camera_Query)

In [None]:
Camera_DF.info()

In [None]:
Customer_DF_2=Customer_DF_1.merge(Camera_DF,on='customer_id',how='left')

In [None]:
del Customer_DF
gc.collect()

In [None]:
# Rows with no match in the left merge have a missing Camera_flag; mark them 0.
# Assign the filled column back: an inplace fill on a column selection is chained
# assignment, which warns on pandas 2.x and leaves the frame untouched under
# Copy-on-Write.
Customer_DF_2['Camera_flag']=Customer_DF_2['Camera_flag'].fillna(0)

In [None]:
Customer_DF_2['Camera_flag'].value_counts(normalize=True)*100

In [None]:
Camera_TC_Query='''

select dealer_name,customer_id,unit_id,device_id,trouble_condition_name,trouble_condition_group_id,trouble_condition_group_desc, start_date_utc,end_date_utc
from `cio-datahub-enterprise-pr-183a.src_adc.bq_troublecondition_data`
where trouble_condition_start_ind='Y'and trouble_condition_closed_ind='Y' and date(start_date_utc) >= '2022-05-01' and date(start_date_utc) <'2022-08-01' and trouble_condition_name in ('CameraNotReachable','CameraNotCommunicating','BroadbandCommFailure')

'''

In [None]:
Camera_TC_DF_May_Aug2022=extract_bq_data(bq_client, sql=Camera_TC_Query)

In [None]:
Camera_TC_DF_May_Aug2022.info()

In [None]:
Camera_TC_DF_May_Aug2022['customer_id'].nunique()

In [None]:
Camera_TC_DF_May_Aug2022['Camera_TC_issue_last3M_flag']=1

In [None]:
Camera_TC_DF_May_Aug2022_customer_list=Camera_TC_DF_May_Aug2022[['customer_id','Camera_TC_issue_last3M_flag']].drop_duplicates()

In [None]:
Camera_TC_DF_May_Aug2022_customer_list.info()

In [None]:
Customer_DF_3=Customer_DF_2.merge(Camera_TC_DF_May_Aug2022_customer_list,on='customer_id',how='left')

In [None]:
del Customer_DF_1
gc.collect()

In [None]:
# Rows with no match in the left merge have a missing camera-issue flag; mark them 0.
# Assign the filled column back: an inplace fill on a column selection is chained
# assignment, which warns on pandas 2.x and leaves the frame untouched under
# Copy-on-Write.
Customer_DF_3['Camera_TC_issue_last3M_flag']=Customer_DF_3['Camera_TC_issue_last3M_flag'].fillna(0)

In [None]:
Customer_DF_3['Camera_TC_issue_last3M_flag'].value_counts(normalize=True)*100

In [None]:
Termination_Details='''

select customer_id,term_date,term_reason,1 as Churn_flag from `cio-datahub-enterprise-pr-183a.src_adc.bq_customer_account_terminated_details`
where date(term_date) >='2022-08-01'  and date(term_date) < '2022-10-01' 
QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY last_updt_ts DESC) = 1

'''

In [None]:
Termination_DF=extract_bq_data(bq_client, sql=Termination_Details)

In [None]:
Termination_DF.info()

In [None]:
Termination_DF['customer_id'].nunique()

In [None]:
Customer_DF_4=Customer_DF_3.merge(Termination_DF,on='customer_id',how='left')

In [None]:
Customer_DF_4.info()

In [None]:
del Customer_DF_2
gc.collect()

In [None]:
# Rows with no match in the termination left merge have a missing Churn_flag; mark them 0.
# Assign the filled column back: an inplace fill on a column selection is chained
# assignment, which warns on pandas 2.x and leaves the frame untouched under
# Copy-on-Write.
Customer_DF_4['Churn_flag']=Customer_DF_4['Churn_flag'].fillna(0)

In [None]:
Customer_DF_4['Churn_flag'].value_counts(normalize=True)*100

In [None]:
Customer_DF_4.head()

In [None]:
Customer_DF_4['Camera_and_any_TC_issue']=np.where((Customer_DF_4.TC_flag==1) & (Customer_DF_4.Camera_flag==1),1,0)

In [None]:
Best_Practices_Query= '''

select * from `divgpras-pr-579355.ADC.Best_Practices_Customers`

'''



In [None]:
Best_Practices_DF=extract_bq_data(bq_client, sql=Best_Practices_Query)

In [None]:
Best_Practices_DF.info()

In [None]:
Customer_DF_5=Customer_DF_4.merge(Best_Practices_DF,on='customer_id',how='left')

In [None]:
del Customer_DF_3
gc.collect()

In [None]:
Customer_DF_5.info()

In [None]:
Customer_DF_5['Best_Practice_All'].isna().sum()

In [None]:
# Rows with no match in the best-practices left merge have a missing score; treat as 0.
# Assign the filled column back: an inplace fill on a column selection is chained
# assignment, which warns on pandas 2.x and leaves the frame untouched under
# Copy-on-Write.
Customer_DF_5['Best_Practice_All']=Customer_DF_5['Best_Practice_All'].fillna(0)

In [None]:
Customer_DF_5['Best_Practice_All'].value_counts(normalize=True)*100

In [None]:
# Flag is 1 only where Best_Practice_All equals 6, otherwise 0
# (vectorised comparison instead of a per-row lambda).
Customer_DF_5['Best_Practice_All_flag']=Customer_DF_5['Best_Practice_All'].eq(6).astype('int64')

In [None]:
Customer_DF_5['Best_Practice_All_flag'].value_counts(normalize=True)*100

In [None]:
Customer_DF_5['Churn_flag'].value_counts(normalize=True)*100

In [None]:
Customer_DF_5.info()

In [None]:
Customer_DF_5.fillna(0,inplace=True)

In [None]:
Arming_Query='''

with max_date_sql as (
select  max(date(date)) as max_date from `cio-datahub-enterprise-pr-183a.src_adc.bq_aggregate_daily_arming_commands`
)

select id_cust as customer_id,sum(count_arm_commands) as sum_arm_commands_last15days,sum(count_disarm_commands) as sum_disarm_commands_last15days,count (distinct date(date)) as number_days_arming_disarming_last15days
from `cio-datahub-enterprise-pr-183a.src_adc.bq_aggregate_daily_arming_commands`
where date(date) <= (select max_date from max_date_sql)

and date(date) > DATE_ADD((select max_date from max_date_sql),interval -15 DAY)
group by customer_id
order by customer_id


'''

In [None]:
Arming_DF=extract_bq_data(bq_client, sql=Arming_Query)

In [None]:
Arming_DF.info()

In [None]:
Arming_DF.head()

In [None]:
# Percentage of the 15-day window on which the customer has arming rows:
# distinct active days * 100 / 15. The divisor 15 mirrors the
# `interval -15 DAY` window in Arming_Query and must change together with it.
Arming_DF['Arming_Consistency_last15days']=Arming_DF['number_days_arming_disarming_last15days'].apply(lambda x: x*100/15)

# Every customer returned by Arming_Query gets flag 1; customers absent from
# Arming_DF end up with a missing value after a later left merge.
Arming_DF['Arming_flag_last15days']=1

In [None]:
Arming_DF['Arming_Consistency_last15days'].value_counts(normalize=True)*100

In [None]:
Customer_DF_6=Customer_DF_5.merge(Arming_DF,on='customer_id',how='left')

In [None]:
del Customer_DF_4
gc.collect()

In [None]:
Customer_DF_6['Arming_flag_last15days'].isna().sum()*100/Customer_DF_6.shape[0]


In [None]:
unique_customer_count=Customer_DF_6.shape[0]


In [None]:
Customer_DF_6.fillna(0,inplace=True)

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Arming_flag_last15days']).agg(
    Unique_Customers=('customer_id','nunique'), 
        Share_of_Customers=('customer_id',lambda x: round(x.nunique()*100/unique_customer_count,2)),    

    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: round(x.mean()*100,2)),
     ).reset_index().sort_values('Arming_flag_last15days',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Arming_Consistency_last15days']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Share_of_Customers=('customer_id',lambda x: round(x.nunique()*100/unique_customer_count,2)),    
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: round(x.mean()*100,2)),
     ).reset_index().sort_values('Arming_Consistency_last15days',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Best_partices_1']).agg(
    Unique_Customers=('customer_id','nunique'),
    Share_of_Customers=('customer_id',lambda x: x.nunique()*100/unique_customer_count),
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Best_partices_1',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Best_partices_2']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Share_of_Customers=('customer_id',lambda x: x.nunique()*100/unique_customer_count),
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: round(x.mean()*100,2)),
     ).reset_index().sort_values('Best_partices_2',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Best_partices_3']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Best_partices_3',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Best_partices_4']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Best_partices_4',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Best_partices_5']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Best_partices_5',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Best_partices_6']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Best_partices_6',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Best_Practice_All_flag']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Best_Practice_All_flag',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['TC_flag']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Unique_Customers',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Camera_flag']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Unique_Customers',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Camera_TC_issue_last3M_flag']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Unique_Customers',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_6.groupby(['Best_Practice_All_flag']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Unique_Customers',ascending=False))

In [None]:
pd.DataFrame(Customer_DF_5.groupby(['Camera_and_any_TC_issue']).agg(
    Unique_Customers=('customer_id','nunique'), 
    Churn_counts = ('Churn_flag','sum'),
     Churn_rate = ('Churn_flag',lambda x: x.mean()*100),
     ).reset_index().sort_values('Unique_Customers',ascending=False))

In [None]:
Customer_DF_6.info()

In [None]:
Customer_DF_6.columns

In [None]:
feature_cols=[
       'Central_Station_Condition_Issue_count_last3M',
       'Critical_System_Issue_count_last3M', 'Engagement_Issue_count_last3M',
       'System_Condition_Issue_count_last3M',
       'Central_Station_Condition_Issue_duration_hours_sum_last3M',
       'Critical_System_Issue_duration_hours_sum_last3M',
       'Engagement_Issue_duration_hours_sum_last3M',
       'System_Condition_Issue_duration_hours_sum_last3M', 'TC_flag',
       'Camera_flag', 'Camera_TC_issue_last3M_flag', 
        'Camera_and_any_TC_issue',
       'Best_partices_1', 'Best_partices_2', 'Best_partices_3',
       'Best_partices_4', 'Best_partices_5', 'Best_partices_6',
       'Best_Practice_All_flag',
       'sum_arm_commands_last15days', 'sum_disarm_commands_last15days',
       'number_days_arming_disarming_last15days',
       'Arming_Consistency_last15days', 'Arming_flag_last15days']

In [None]:
Customer_DF_6.shape

In [None]:
Customer_DF_6['Camera_flag'].value_counts()

In [None]:
Customer_DF_6['Camera_and_any_TC_issue'].value_counts()

In [None]:
Customer_DF_6['Camera_TC_issue_last3M_flag'].value_counts()

In [None]:
# Boolean mask of customers with TC_flag == 1.
# The original `a==1 & a==1` parsed as the chained comparison `a == (1 & a) == 1`
# because `&` binds tighter than `==`, and that raises on a Series.
# NOTE(review): the original repeated the same TC_flag condition twice; the second
# operand was probably meant to be a different flag - TODO confirm and add it as
# `& (Customer_DF_6['<other_flag>']==1)`.
Customer_DF_sub=(Customer_DF_6['TC_flag']==1)

In [None]:
from sklearn.model_selection import train_test_split


def data_splitting(dataset=None,feature_cols=None,Target=None,testsize=0.3,random_state=11):
    """Split ``dataset`` into stratified train / validation sets.

    Parameters
    ----------
    dataset : pandas.DataFrame, optional
        Source frame. When omitted, the notebook-level ``Customer_DF_6`` is
        looked up at call time (not at definition time, so defining this
        function no longer requires the frame to exist already).
    feature_cols : list of str, optional
        Feature columns; non-numeric ones are one-hot encoded with
        ``pd.get_dummies`` using ``'__'`` as the separator. Defaults to ``[]``.
    Target : str or list of str, optional
        Target column(s), also used for stratification. Defaults to ``[]``.
    testsize : float
        Fraction of rows assigned to the validation set.
    random_state : int
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_validation, Y_train, Y_validation, final_feature_columns)``
        where the last item is the list of encoded training columns.
    """
    if dataset is None:
        dataset = Customer_DF_6
    # None sentinels avoid sharing one mutable default list across calls.
    feature_cols = [] if feature_cols is None else feature_cols
    Target = [] if Target is None else Target

    X=dataset[feature_cols]
    Y=dataset[Target]
    X1=pd.get_dummies(X,prefix_sep='__')
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X1, Y, test_size=testsize, random_state=random_state, stratify=Y)
    final_feature_columns=X_train.columns.tolist()
    print()
    print("Data splitting Completed.")
    return X_train,X_validation,Y_train,Y_validation,final_feature_columns

In [None]:
Target_col='Churn_flag'
X_train,X_validation,Y_train,Y_validation,final_feature_columns=data_splitting(
dataset=Customer_DF_6,feature_cols=feature_cols,Target=Target_col,testsize=0.3)

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# X_train=pd.get_dummies(Customer_DF_6[feature_cols],prefix_sep='__')

# Y_train=Customer_DF_6['Churn_flag']

In [None]:
X_train.info()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: scikit-learn permutes candidate features at every split, so
# without random_state equally good splits can be chosen differently between runs.
dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=200, random_state=11)

dt.fit(X_train,Y_train)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer



def lift_calc_pd(model, X_test, y_test, target_column, n_bins=20):
    """Build a lift / capture-rate table for a binary classifier.

    Rows are scored with ``model.predict_proba``, ranked by the second
    probability column and cut into ``n_bins`` equally sized groups. Group
    ``SD == 1`` holds the highest scores.

    Parameters
    ----------
    model
        Object with ``predict_proba(X)`` returning two columns per row.
    X_test
        Features passed unchanged to ``model.predict_proba``.
    y_test : pandas.Series or array-like
        Labels aligned row by row with ``X_test``. A Series is accepted
        regardless of its ``name``.
    target_column : str
        Column name used for the labels inside the function.
    n_bins : int
        Number of score groups (20 reproduces the original 5 % buckets).

    Returns
    -------
    pandas.DataFrame
        One row per group, sorted by ``SD``, with columns ``SD``, ``Freq``
        (sum of labels in the group), ``Cumm Sum``, ``Lift``, ``Cumm Lift``,
        ``Capture Rate`` and ``Cummu Capture Rate`` (both in percent).
    """
    prediction = model.predict_proba(X_test)
    print("output shape", prediction.shape)
    print("y_test shape:", y_test.shape)

    prediction_df = pd.DataFrame(
        prediction, columns=['prob_of_predict_0', 'prob_of_predict_1'])
    if isinstance(y_test, pd.Series):
        # DataFrame(series, columns=[...]) selects by name and yields an all-NaN
        # column when the Series is named differently; to_frame renames instead.
        y_df = y_test.to_frame(name=target_column)
    else:
        y_df = pd.DataFrame(y_test, columns=[target_column])
    # predict_proba rows are positional, so give them the label index before joining.
    prediction_with_label_df = y_df.join(prediction_df.set_index(y_df.index))

    # sort_values returns a new frame, so the column assignments below do not
    # touch a slice of prediction_with_label_df (no SettingWithCopyWarning).
    v1 = prediction_with_label_df[[target_column, 'prob_of_predict_1']].sort_values(
        by=['prob_of_predict_1'])
    # Ranking with method='first' breaks ties so every bin gets the same row count.
    v1['Qcut'] = pd.qcut(v1['prob_of_predict_1'].rank(method='first'),
                         q=n_bins, labels=False)
    v1['SD'] = n_bins - v1['Qcut'].astype(int)

    vl1 = v1.drop(['Qcut'], axis=1)
    vl1['Freq'] = vl1.groupby('SD')[target_column].transform('sum')
    vl2 = (vl1.drop([target_column, 'prob_of_predict_1'], axis=1)
              .drop_duplicates()
              .sort_values(by=['SD']))
    total_positives = vl2['Freq'].sum()
    vl2['Cumm Sum'] = vl2['Freq'].cumsum()
    # Lift = share of positives in the group / share of rows in the group.
    vl2['Lift'] = vl2['Freq'].div(total_positives).div(1 / n_bins)
    vl2['Cumm Lift'] = vl2['Cumm Sum'].div(total_positives).div(vl2['SD'] / n_bins)
    vl2['Capture Rate'] = vl2['Lift'] * (100 / n_bins)
    vl2['Cummu Capture Rate'] = vl2['Capture Rate'].cumsum()
    return vl2

In [None]:
Target_col

In [None]:
val_DF_metric=lift_calc_pd(model=dt,X_test=X_validation,y_test=Y_validation,target_column=Target_col)

In [None]:
val_DF_metric

In [None]:
importances = dt.feature_importances_

# Label with the columns the tree was fitted on (the get_dummies output in
# X_train); feature_cols has a different length as soon as any feature is
# one-hot encoded, which would make the Series constructor fail.
forest_importances = pd.Series(importances, index=X_train.columns)
forest_importances.plot.bar()


In [None]:


# from matplotlib import pyplot as plt

# fig = plt.figure(figsize=(45,45))
# _ = tree.plot_tree(dt, 
#                    feature_names=feature_cols,  
#                    class_names=target_names,
#                    filled=True)

In [None]:
# Imported in this cell because the notebook's `from sklearn import tree` only
# appears in a later cell, so a top-to-bottom run would hit a NameError here.
from sklearn import tree
import graphviz

target_names=['No_Churn','Churn']
# The unused `plt.figure(...)` call was dropped: `plt` is not imported anywhere in
# the visible notebook and graphviz renders the tree without matplotlib.
dot_data = tree.export_graphviz(dt, out_file=None,
                                # names must match the columns the tree was fitted on
                                feature_names=list(X_train.columns),
                                class_names=target_names,
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png")
graph

In [None]:
from sklearn.tree import _tree

def get_rules(tree, feature_names, class_names):
    """Turn every root-to-leaf path of a fitted tree into a readable rule string.

    Parameters
    ----------
    tree
        Fitted estimator exposing ``tree_`` (with ``feature``, ``threshold``,
        ``children_left``, ``children_right``, ``value``, ``n_node_samples``).
    feature_names
        Indexable of feature names, addressed by the indices in ``tree_.feature``.
    class_names
        Indexable of class labels, or ``None`` to report the first stored
        value of the leaf as ``response`` instead of a class.

    Returns
    -------
    list of str
        One rule per leaf, ordered from the leaf with the most samples to the
        leaf with the fewest.
    """
    fitted = tree.tree_

    # Name of the split feature at every node; leaves carry a placeholder.
    node_labels = []
    for feat_idx in fitted.feature:
        if feat_idx != _tree.TREE_UNDEFINED:
            node_labels.append(feature_names[feat_idx])
        else:
            node_labels.append("undefined!")

    leaf_paths = []

    def walk(node_id, conditions):
        # Leaf: close the path with (stored value, sample count).
        if fitted.feature[node_id] == _tree.TREE_UNDEFINED:
            leaf_paths.append(
                conditions + [(fitted.value[node_id], fitted.n_node_samples[node_id])]
            )
            return
        label = node_labels[node_id]
        cutoff = np.round(fitted.threshold[node_id], 3)
        walk(fitted.children_left[node_id], conditions + [f"({label} <= {cutoff})"])
        walk(fitted.children_right[node_id], conditions + [f"({label} > {cutoff})"])

    walk(0, [])

    # Largest leaves first.
    sample_sizes = [entry[-1][1] for entry in leaf_paths]
    order = np.argsort(sample_sizes)[::-1]
    leaf_paths = [leaf_paths[pos] for pos in order]

    rules = []
    for entry in leaf_paths:
        *conditions, (node_value, n_samples) = entry
        text = "if " + " and ".join(str(cond) for cond in conditions) + " then "
        if class_names is None:
            text += "response: "+str(np.round(node_value[0][0],3))
        else:
            class_weights = node_value[0]
            winner = np.argmax(class_weights)
            text += f"class: {class_names[winner]} (proba: {np.round(100.0*class_weights[winner]/np.sum(class_weights),2)}%)"
        text += f" | based on {n_samples:,} samples"
        rules.append(text)

    return rules

In [None]:
target_names=['No_Churn','Churn']

rules=get_rules(dt, X_train.columns, target_names)

In [None]:
rules_df= []
#%%
for r in rules:
    rules_df.append(r)

In [None]:
from sklearn import tree

text_representation = tree.export_text(dt)
print(text_representation)