In [None]:
#### import global modules
import os
import sys
import pandas as pd
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery

# Set global vars
# Project root = the part of the working directory that precedes the first
# occurrence of the substring 'notebooks'.
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_queries = pth_project / 'core' / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Put the project root first on sys.path so the `core` package can be imported below.
sys.path.insert(0, str(pth_project))
# Read the YAML config through a context manager so the file handle is closed
# as soon as parsing finishes (the bare `.open()` call left it open).
with pth_creds.open() as f_project_config:
    d_project_config = safe_load(f_project_config)
# d_params = safe_load((pth_project / 'core' / 'parameters' / 'common.yaml').open())['data_extract']

# import local modules
from core.utils.gcp import connect_bq_services
# from core.etl.extract import extract_bq_data, extract_pr_codes, format_conv_df, filter_convs

# Open the BigQuery client for the GCP project named in the local config,
# and let pandas print up to 100 rows when displaying frames.
gcp_project_name = d_project_config['gcp-project-name']
bq_client = connect_bq_services(gcp_project_name)
pd.set_option('display.max_rows', 100)

In [None]:
def extract_bq_data(bq_client, sql=None, pth_query=None):
    """Run a query through ``bq_client`` and return its result as a dataframe.

    Parameters
    ----------
    bq_client
        Object exposing ``query(sql)`` whose result has ``to_dataframe()``.
    sql
        Query text. Takes precedence over ``pth_query`` when both are given.
    pth_query
        Object exposing ``read_text()`` that returns the query text; only
        consulted when ``sql`` is None.

    Returns
    -------
    Whatever ``bq_client.query(...).to_dataframe()`` returns.

    Raises
    ------
    ValueError
        If both ``sql`` and ``pth_query`` are None.
    """
    if sql is None:
        # No inline SQL: fall back to the query file, or fail when neither is given.
        if pth_query is None:
            raise ValueError('`sql` or `pth_query` should be set')
        sql = pth_query.read_text()
    return bq_client.query(sql).to_dataframe()

In [None]:
# Customer base query: rows of bq_customer_account_details whose last_updt_ts falls
# in [2022-08-01, 2022-09-01), excluding account_type_name 'Standalone' and keeping
# customer_type_name 'Customer'. QUALIFY keeps only the most recently updated row
# per customer_id, so the result has one row per customer.
Customer_Details_Query='''

SELECT customer_id,dealer_customer_id,primary_login_id,dealer_name,join_date,account_type_name,customer_type_name,primary_email,primary_phone,last_updt_ts,suspended_date_utc
FROM `cio-datahub-enterprise-pr-183a.src_adc.bq_customer_account_details` 
where last_updt_ts >='2022-08-01'  and last_updt_ts < '2022-09-01'  
and account_type_name!='Standalone'
and customer_type_name='Customer'
QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY last_updt_ts DESC) = 1
order by dealer_name,join_date,customer_id


'''

In [None]:
# Run the customer base query and keep the result as a dataframe.
ADC_Aug2022_CustomerBase= extract_bq_data(bq_client,sql=Customer_Details_Query)

In [None]:
# cols_required=['customer_id','dealer_customer_id',primary_login_id,dealer_name,join_date,account_type_name,customer_type_name,primary_email,primary_phone]

In [None]:
# ADC_Aug2022_CustomerBase['suspended_date_utc'].value_counts()

In [None]:
# Number of customer rows that carry a (non-null) suspended_date_utc.
ADC_Aug2022_CustomerBase['suspended_date_utc'].notna().sum()

In [None]:
# 100-(ADC_Aug2022_CustomerBase['suspended_date_utc'].isna().sum()*100/ADC_Aug2022_CustomerBase.shape[0])

In [None]:
# ADC_Aug2022_CustomerBase['suspended_date_utc'].head(10)

In [None]:
# Column dtypes and non-null counts of the customer base.
ADC_Aug2022_CustomerBase.info()

In [None]:
# Termination query: rows of bq_customer_account_terminated_details whose term_date
# falls in [2022-08-01, 2022-09-01), with the same account/customer type filters as
# the customer base query. QUALIFY keeps the most recently updated row per customer_id.
Query_Termination_details='''

select * from `cio-datahub-enterprise-pr-183a.src_adc.bq_customer_account_terminated_details`
Where term_date >='2022-08-01' and term_date < '2022-09-01'
and account_type_name!='Standalone'
and customer_type_name='Customer'
QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY last_updt_ts DESC) = 1



'''

In [None]:
# Run the termination query and keep the result as a dataframe.
Termination_DF_Aug2022= extract_bq_data(bq_client,sql=Query_Termination_details)

In [None]:
# Column dtypes and non-null counts of the termination extract.
Termination_DF_Aug2022.info()

In [None]:
# Every row returned by the termination query is labelled as churned (Churn = 1).
Termination_DF_Aug2022['Churn']=1

In [None]:
# Churn is 1 on every row, so this equals the number of terminated customers extracted.
Termination_DF_Aug2022['Churn'].sum()

In [None]:
# Frequency of each termination reason.
Termination_DF_Aug2022['term_reason'].value_counts()

In [None]:
# Keep only the join key and the termination fields needed downstream.
cols_req_termination = [
    'customer_id',
    'term_date',
    'term_reason',
    'Churn',
]
Termination_DF_Aug2022_sub = Termination_DF_Aug2022.loc[:, cols_req_termination]

In [None]:
# Preview the first rows of the termination subset.
Termination_DF_Aug2022_sub.head()

In [None]:
# Earliest and latest term_date in the extract (sanity check on the query's date window).
Termination_DF_Aug2022_sub['term_date'].min(),Termination_DF_Aug2022_sub['term_date'].max()

In [None]:
# Left join: keep every customer in the base and attach termination fields where
# a termination row exists for the same customer_id (NaN otherwise).
ADC_Aug2022_CustomerBase_merge = pd.merge(
    ADC_Aug2022_CustomerBase,
    Termination_DF_Aug2022_sub,
    on='customer_id',
    how='left',
)

In [None]:
# Preview the customer base with termination fields attached.
ADC_Aug2022_CustomerBase_merge.head()

In [None]:
# Number of base customers that matched a termination row (unmatched rows are NaN and skipped by sum).
ADC_Aug2022_CustomerBase_merge['Churn'].sum()

In [None]:
# Customers without a termination row got NaN from the left merge; label them 0.
# Assign the result back instead of calling fillna(..., inplace=True) on the column
# selection: that chained in-place form warns on recent pandas and leaves the frame
# unchanged under Copy-on-Write.
ADC_Aug2022_CustomerBase_merge['Churn'] = ADC_Aug2022_CustomerBase_merge['Churn'].fillna(0)

In [None]:
# Share of each Churn value, expressed as a percentage of rows.
ADC_Aug2022_CustomerBase_merge['Churn'].value_counts(normalize=True)*100

In [None]:
# Pulls every column of the ADC.Trouble_Condition_Group_last3months_Aug2022 table.
# NOTE(review): unlike the System_Check queries in this notebook, the table name has
# no project prefix, so it presumably resolves against the client's default
# project - confirm that is the intended one.
TC_Query='''


select * from `ADC.Trouble_Condition_Group_last3months_Aug2022`


'''

In [None]:
# Run the trouble-condition query and keep the result as a dataframe.
TC_Data=extract_bq_data(bq_client,sql=TC_Query)

In [None]:
# Marker column: after the left merge onto the customer base it is 1 where a
# customer has a TC_Data row and NaN otherwise.
TC_Data['TC_data_flag']=1

In [None]:
# Column dtypes and non-null counts of the trouble-condition extract.
TC_Data.info()

In [None]:
# Left join the trouble-condition features onto the labelled customer base;
# customers without a TC_Data row keep NaN in the added columns.
ADC_Aug2022_CustomerBase_Full_data = pd.merge(
    ADC_Aug2022_CustomerBase_merge,
    TC_Data,
    how='left',
    on='customer_id',
)

In [None]:
# Preview the customer base with trouble-condition columns attached.
ADC_Aug2022_CustomerBase_Full_data.head()

In [None]:
# Column dtypes and non-null counts after the trouble-condition merge.
ADC_Aug2022_CustomerBase_Full_data.info()

In [None]:
# Number of merged rows that matched a TC_Data row (unmatched rows are NaN and skipped by sum).
ADC_Aug2022_CustomerBase_Full_data['TC_data_flag'].sum()

In [None]:
# Trouble-condition count/duration columns: customers without a TC_Data row got
# NaN from the left merge, which is replaced by 0 here.
tc_issue_cols = [
    'Central_Station_Condition_Issue_count_last3M',
    'Critical_System_Issue_count_last3M',
    'Engagement_Issue_count_last3M',
    'System_Condition_Issue_count_last3M',
    'Central_Station_Condition_Issue_duration_hours_sum_last3M',
    'Critical_System_Issue_duration_hours_sum_last3M',
    'Engagement_Issue_duration_hours_sum_last3M',
    'System_Condition_Issue_duration_hours_sum_last3M',
]
# One assignment over all columns instead of eight chained
# `df[col].fillna(0, inplace=True)` calls: the chained in-place form warns on
# recent pandas and leaves the frame unchanged under Copy-on-Write.
ADC_Aug2022_CustomerBase_Full_data[tc_issue_cols] = (
    ADC_Aug2022_CustomerBase_Full_data[tc_issue_cols].fillna(0)
)


In [None]:

# X=ADC_Aug2022_CustomerBase_Full_data['Central_Station_Condition_Issue_count_last3M'].to_numpy().reshape(-1,1)
# y=ADC_Aug2022_CustomerBase_Full_data['Churn'].values
# mutual_info_classif(X, y, discrete_features=True)


In [None]:
# from sklearn.feature_selection import mutual_info_classif
# import numpy as np
# import gc

# TC_Cols=[
# 'Central_Station_Condition_Issue_count_last3M',
# 'Critical_System_Issue_count_last3M', 
# 'Engagement_Issue_count_last3M',
# 'System_Condition_Issue_count_last3M',
# 'Central_Station_Condition_Issue_duration_hours_sum_last3M',
# 'Critical_System_Issue_duration_hours_sum_last3M',
# 'Engagement_Issue_duration_hours_sum_last3M',
# 'System_Condition_Issue_duration_hours_sum_last3M'
# ]

# MI_score_features_name = []
# MI_score_features_value = []
# Y=ADC_Aug2022_CustomerBase_Full_data['Churn'].values

# for datacols in TC_Cols:
#         X= ADC_Aug2022_CustomerBase_Full_data[datacols].fillna(0).to_numpy().reshape(-1,1)
#         feature_scores= mutual_info_classif(X,Y,random_state=0)
#         value= np.round(feature_scores[0],4)
#         MI_score_features_value.append(value)
#         MI_score_features_name.append(datacols)
#         print(datacols,':',value)
#         gc.collect()
#         MI_data = pd.DataFrame(
#         {'MI_score_features_name': MI_score_features_name,
#         'MI_score_features_value': MI_score_features_value,
#          'MI_features_datatype': 'numeric'})

# MI_data.sort_values(by=['MI_score_features_value'],ascending=False)
#         # .to_excel(data_processed+"Numcols_Mutual_information_202101.xlsx",index=False)
# print("Mutual Information calculation for Numerical columns completed.")



In [None]:
# Pulls every column of the System_Check_last3months_May2022_Aug2022 table
# (dataset ADC, project divgpras-pr-579355).
System_Check_Query_last3M='''


select * from `divgpras-pr-579355.ADC.System_Check_last3months_May2022_Aug2022`


'''

In [None]:
# Pulls every column of the System_Check_last1month_Jul2022_Aug2022 table
# (dataset ADC, project divgpras-pr-579355).
System_Check_Query_last1M='''


select * from `divgpras-pr-579355.ADC.System_Check_last1month_Jul2022_Aug2022`


'''

In [None]:
# Run the 3-month system-check query and keep the result as a dataframe.
System_Check_Data_last3M=extract_bq_data(bq_client,sql=System_Check_Query_last3M)

In [None]:
# Run the 1-month system-check query and keep the result as a dataframe.
System_Check_Data_last1M=extract_bq_data(bq_client,sql=System_Check_Query_last1M)

In [None]:
# Column dtypes and non-null counts of the 3-month system-check extract.
System_Check_Data_last3M.info()

In [None]:
# Preview the first rows of the 3-month system-check extract.
System_Check_Data_last3M.head()

In [None]:
# Marker column set to 1 on every 3-month system-check row.
# NOTE(review): its name contains '_last3M', so the later '_last3M' column filter
# selects this flag as a feature too - confirm that is intended.
System_Check_Data_last3M['System_Check_data_last3M_flag']=1

In [None]:
# Marker column set to 1 on every 1-month system-check row.
# NOTE(review): its name contains '_last1M', so the later '_last1M' column filter
# selects this flag as a feature too - confirm that is intended.
System_Check_Data_last1M['System_Check_data_last1M_flag']=1

In [None]:
# Left join the 3-month system-check columns onto the customer frame;
# customers without a matching row keep NaN in the added columns.
ADC_Aug2022_CustomerBase_Full_data_2 = pd.merge(
    ADC_Aug2022_CustomerBase_Full_data,
    System_Check_Data_last3M,
    how='left',
    on='customer_id',
)

In [None]:
# Left join the 1-month system-check columns onto the `_2` frame. Note the naming:
# `_1` is the later, fuller frame and is the one used by the rest of the notebook.
ADC_Aug2022_CustomerBase_Full_data_1 = pd.merge(
    ADC_Aug2022_CustomerBase_Full_data_2,
    System_Check_Data_last1M,
    how='left',
    on='customer_id',
)

In [None]:
# Column dtypes and non-null counts after both system-check merges.
ADC_Aug2022_CustomerBase_Full_data_1.info()

In [None]:
# Percentage of missing values per column.
ADC_Aug2022_CustomerBase_Full_data_1.isna().sum()*100/ADC_Aug2022_CustomerBase_Full_data_1.shape[0]

In [None]:
# Feature columns are picked by name: any column containing '_last3M' or '_last1M'.
merged_column_names = list(ADC_Aug2022_CustomerBase_Full_data_1.columns)
feature_cols_last3M = list(filter(lambda name: '_last3M' in name, merged_column_names))
feature_cols_last1M = list(filter(lambda name: '_last1M' in name, merged_column_names))

# 3-month features first, then 1-month features.
feature_cols_previous_months = [*feature_cols_last3M, *feature_cols_last1M]

In [None]:
# Replace missing feature values with 0: build the zero-filled copy of the feature
# columns, then write it back into the frame in place.
features_zero_filled = ADC_Aug2022_CustomerBase_Full_data_1[feature_cols_previous_months].fillna(0)
ADC_Aug2022_CustomerBase_Full_data_1.update(features_zero_filled)

In [None]:
# Re-check dtypes and non-null counts after zero-filling the feature columns.
ADC_Aug2022_CustomerBase_Full_data_1.info()

In [None]:
# Store the Churn label with pandas' categorical dtype.
ADC_Aug2022_CustomerBase_Full_data_1['Churn']=ADC_Aug2022_CustomerBase_Full_data_1['Churn'].astype('category')

In [None]:
from sklearn.feature_selection import mutual_info_classif
import numpy as np
import gc



# Mutual information between each historical feature and the Churn label,
# computed one column at a time with a fixed random_state.
Y = ADC_Aug2022_CustomerBase_Full_data_1['Churn'].values
MI_score_features_name = []
MI_score_features_value = []

for feature_name in feature_cols_previous_months:
    feature_series = ADC_Aug2022_CustomerBase_Full_data_1[feature_name].fillna(0)
    # mutual_info_classif expects a 2-D feature matrix, hence the single-column reshape.
    X = feature_series.to_numpy().reshape(-1, 1)
    mi_estimate = mutual_info_classif(X, Y, random_state=0)[0]
    # Rounded to 4 decimals, then scaled by 100.
    value = np.round(mi_estimate, 4) * 100
    MI_score_features_name.append(feature_name)
    MI_score_features_value.append(value)
    print(feature_name, ':', value)
    gc.collect()

print("Mutual Information calculation for Numerical columns completed.")

In [None]:
# One row per feature: its name, its scaled MI score, and a constant datatype tag.
mi_table_columns = {
    'MI_score_features_name': MI_score_features_name,
    'MI_score_features_value': MI_score_features_value,
    'MI_features_datatype': 'numeric',
}
MI_data = pd.DataFrame(mi_table_columns)

# MI_data.sort_values(by=['MI_score_features_value'],ascending=False)
        # .to_excel(data_processed+"Numcols_Mutual_information_202101.xlsx",index=False)


In [None]:
# Order features from highest to lowest mutual-information score.
MI_data=MI_data.sort_values(by=['MI_score_features_value'],ascending=False)

In [None]:
# Round the scores to 6 decimals (the values were scaled by 100 after their first rounding).
MI_data['MI_score_features_value']=MI_data['MI_score_features_value'].round(decimals=6)

In [None]:
# Display the ranked mutual-information table.
MI_data

In [None]:
# Upload the mutual-information table to BigQuery.
config= bigquery.job.LoadJobConfig()

# config._properties['timePartitioning'] = {'field': 'Month_Year'}
# WRITE_TRUNCATE: each run overwrites the destination table's existing contents.
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

Table_BQ = 'ADC.Mutual_Info_table'

# load_table_from_dataframe only starts the load job and returns a job handle.
bq_table_instance= bq_client.load_table_from_dataframe(MI_data, Table_BQ,job_config=config)
# Wait for the job to finish so a failed load raises here instead of going unnoticed.
bq_table_instance.result()

In [None]:
from scipy import stats

# Point-biserial correlation between the Churn label and each historical feature.
corr_list_name, corr_list_correlation, corr_list_pvalue = [], [], []

# The label is cast from category to float; its list form is built once and
# reused for every feature.
y = ADC_Aug2022_CustomerBase_Full_data_1['Churn'].astype(float)
churn_as_list = list(y)

for column in feature_cols_previous_months:
    x = ADC_Aug2022_CustomerBase_Full_data_1[column]
    corr = stats.pointbiserialr(list(x), churn_as_list)
    # corr[0] is the correlation coefficient, corr[1] its p-value.
    coefficient, p_value = corr[0], corr[1]
    corr_list_name.append(column)
    corr_list_correlation.append(coefficient)
    corr_list_pvalue.append(p_value)
    print(column, ':', corr)
    gc.collect()



In [None]:
# One row per feature: its name, point-biserial correlation, p-value, and a
# constant datatype tag.
bc_table_columns = {
    'BC_score_features_name': corr_list_name,
    'BC_score_correlation': corr_list_correlation,
    'BC_score_pvalue': corr_list_pvalue,
    'BC_features_datatype': 'numeric',
}
BI_data = pd.DataFrame(bc_table_columns)

In [None]:
# Display the point-biserial correlation table.
BI_data